]> git.ipfire.org Git - thirdparty/unbound.git/commitdiff
verbosity and stats control.
authorWouter Wijngaards <wouter@nlnetlabs.nl>
Mon, 15 Sep 2008 14:19:41 +0000 (14:19 +0000)
committerWouter Wijngaards <wouter@nlnetlabs.nl>
Mon, 15 Sep 2008 14:19:41 +0000 (14:19 +0000)
git-svn-id: file:///svn/unbound/trunk@1237 be551aaa-1e26-0410-a405-d3ace91eadb9

daemon/remote.c
daemon/stats.c
daemon/stats.h
daemon/worker.c
daemon/worker.h
doc/Changelog
doc/control_proto_spec.txt
doc/unbound-control.8.in
smallapp/unbound-control.c

index 855e3b1758071fe5384771215a0e499415668b21..730d30f035b52bff3b0bf78726a557cec0024bd6 100644 (file)
@@ -46,6 +46,7 @@
 #include "daemon/remote.h"
 #include "daemon/worker.h"
 #include "daemon/daemon.h"
+#include "daemon/stats.h"
 #include "util/log.h"
 #include "util/config_file.h"
 #include "util/net_help.h"
@@ -73,6 +74,25 @@ log_crypto_err(const char* str)
        }
 }
 
+/** divide sum of timers to get average */
+static void
+timeval_divide(struct timeval* avg, struct timeval* sum, size_t d)
+{
+#ifndef S_SPLINT_S
+       size_t leftover;
+       if(d == 0) {
+               avg->tv_sec = 0;
+               avg->tv_usec = 0;
+               return;
+       }
+       avg->tv_sec = sum->tv_sec / d;
+       avg->tv_usec = sum->tv_usec / d;
+       /* handle fraction from seconds divide */
+       leftover = sum->tv_sec - avg->tv_sec*d;
+       avg->tv_usec += (leftover*1000000)/d;
+#endif
+}
+
 struct daemon_remote*
 daemon_remote_create(struct worker* worker)
 {
@@ -452,6 +472,16 @@ ssl_read_line(SSL* ssl, char* buf, size_t max)
        return 0;
 }
 
+/** skip whitespace, return new pointer into string */
+static char*
+skipwhite(char* str)
+{
+       /* EOS \0 is not a space */
+       while( isspace(*str) ) 
+               str++;
+       return str;
+}
+
 /** send the OK to the control client */
 static void send_ok(SSL* ssl)
 {
@@ -460,7 +490,7 @@ static void send_ok(SSL* ssl)
 
 /** do the stop command */
 static void
-do_stop(struct daemon_remote* rc, SSL* ssl)
+do_stop(SSL* ssl, struct daemon_remote* rc)
 {
        rc->worker->need_to_exit = 1;
        comm_base_exit(rc->worker->base);
@@ -469,25 +499,108 @@ do_stop(struct daemon_remote* rc, SSL* ssl)
 
 /** do the reload command */
 static void
-do_reload(struct daemon_remote* rc, SSL* ssl)
+do_reload(SSL* ssl, struct daemon_remote* rc)
 {
        rc->worker->need_to_exit = 0;
        comm_base_exit(rc->worker->base);
        send_ok(ssl);
 }
 
+/** do the verbosity command */
+static void
+do_verbosity(SSL* ssl, char* str)
+{
+       int val = atoi(str);
+       if(val == 0 && strcmp(str, "0") != 0) {
+               ssl_printf(ssl, "error in verbosity number syntax: %s\n", str);
+               return;
+       }
+       verbosity = val;
+       send_ok(ssl);
+}
+
+/** print stats from statinfo */
+static int
+print_stats(SSL* ssl, char* nm, struct stats_info* s)
+{
+       struct timeval avg;
+       if(!ssl_printf(ssl, "%s.num.queries: %u\n", nm, 
+               (unsigned)s->svr.num_queries)) return 0;
+       if(!ssl_printf(ssl, "%s.num.cachehits: %u\n", nm, 
+               (unsigned)(s->svr.num_queries 
+                       - s->svr.num_queries_missed_cache))) return 0;
+       if(!ssl_printf(ssl, "%s.num.cachemiss: %u\n", nm, 
+               (unsigned)s->svr.num_queries_missed_cache)) return 0;
+       if(!ssl_printf(ssl, "%s.num.recursivereplies: %u\n", nm, 
+               (unsigned)s->mesh_replies_sent)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.avg: %g\n", nm,
+               s->svr.num_queries_missed_cache?
+                       (double)s->svr.sum_query_list_size/
+                       s->svr.num_queries_missed_cache : 0.0)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.max: %u\n", nm,
+               (unsigned)s->svr.max_query_list_size)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.overwritten: %u\n", nm,
+               (unsigned)s->mesh_jostled)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.exceeded: %u\n", nm,
+               (unsigned)s->mesh_dropped)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.current.all: %u\n", nm,
+               (unsigned)s->mesh_num_states)) return 0;
+       if(!ssl_printf(ssl, "%s.requestlist.current.user: %u\n", nm,
+               (unsigned)s->mesh_num_reply_states)) return 0;
+       timeval_divide(&avg, &s->mesh_replies_sum_wait, s->mesh_replies_sent);
+       if(!ssl_printf(ssl, "%s.recursion.time.avg: %d.%6.6d\n", nm,
+               (int)avg.tv_sec, (int)avg.tv_usec)) return 0;
+       if(!ssl_printf(ssl, "%s.recursion.time.median: %g\n", nm, 
+               s->mesh_time_median)) return 0;
+       return 1;
+}
+
+/** print stats for one thread */
+static int
+print_thread_stats(SSL* ssl, int i, struct stats_info* s)
+{
+       char nm[16];
+       snprintf(nm, sizeof(nm), "thread%d", i);
+       nm[sizeof(nm)-1]=0;
+       return print_stats(ssl, nm, s);
+}
+
+/** do the stats command */
+static void
+do_stats(SSL* ssl, struct daemon_remote* rc)
+{
+       struct daemon* daemon = rc->worker->daemon;
+       struct stats_info total;
+       struct stats_info s;
+       int i;
+       /* gather all thread statistics in one place */
+       for(i=0; i<daemon->num; i++) {
+               server_stats_obtain(rc->worker, daemon->workers[i], &s);
+               if(!print_thread_stats(ssl, i, &s))
+                       return;
+               if(i == 0)
+                       total = s;
+               else    server_stats_add(&total, &s);
+       }
+       /* print the thread statistics */
+       total.mesh_time_median /= (double)daemon->num;
+       print_stats(ssl, "total", &total);
+}
+
 /** execute a remote control command */
 static void
 execute_cmd(struct daemon_remote* rc, SSL* ssl, char* cmd)
 {
-       char* p = cmd;
-       /* skip whitespace */
-       while( isspace(*p) ) p++;
-       /* compare command - check longer strings first */
+       char* p = skipwhite(cmd);
+       /* compare command - check longer strings first in case of substrings*/
        if(strncmp(p, "stop", 4) == 0) {
-               do_stop(rc, ssl);
+               do_stop(ssl, rc);
        } else if(strncmp(p, "reload", 6) == 0) {
-               do_reload(rc, ssl);
+               do_reload(ssl, rc);
+       } else if(strncmp(p, "verbosity", 9) == 0) {
+               do_verbosity(ssl, skipwhite(p+9));
+       } else if(strncmp(p, "stats", 5) == 0) {
+               do_stats(ssl, rc);
        } else {
                (void)ssl_printf(ssl, "error unknown command '%s'\n", p);
        }
index 8c2b917b4b9d6dcfee1cbec1867c9ff88a2756de..8fa9d44b139f9cc767899e92f5dbfe4994a3b9e0 100644 (file)
 #include "config.h"
 #include "daemon/stats.h"
 #include "daemon/worker.h"
+#include "daemon/daemon.h"
 #include "services/mesh.h"
+#include "util/config_file.h"
+#include "util/tube.h"
+#include "util/timehist.h"
+
+/** add timers and the values do not overflow or become negative */
+static void
+timeval_add(struct timeval* d, struct timeval* add)
+{
+#ifndef S_SPLINT_S
+       d->tv_sec += add->tv_sec;
+       d->tv_usec += add->tv_usec;
+       while(d->tv_usec > 1000000 ) {
+               d->tv_usec -= 1000000;
+               d->tv_sec++;
+       }
+#endif
+}
 
 void server_stats_init(struct server_stats* stats)
 {
@@ -57,7 +75,8 @@ void server_stats_querymiss(struct server_stats* stats, struct worker* worker)
                stats->max_query_list_size = worker->env.mesh->all.count;
 }
 
-void server_stats_log(struct server_stats* stats, int threadnum)
+void server_stats_log(struct server_stats* stats, struct worker* worker,
+       int threadnum)
 {
        log_info("server stats for thread %d: %u queries, "
                "%u answers from cache, %u recursions", 
@@ -70,5 +89,77 @@ void server_stats_log(struct server_stats* stats, int threadnum)
                stats->num_queries_missed_cache?
                        (double)stats->sum_query_list_size/
                        stats->num_queries_missed_cache : 0.0,
-               (unsigned)stats->num_query_list_exceeded);
+               (unsigned)worker->env.mesh->stats_dropped);
+}
+
+void
+server_stats_compile(struct worker* worker, struct stats_info* s)
+{
+       s->svr = worker->stats;
+       s->mesh_num_states = worker->env.mesh->all.count;
+       s->mesh_num_reply_states = worker->env.mesh->num_reply_states;
+       s->mesh_jostled = worker->env.mesh->stats_jostled;
+       s->mesh_dropped = worker->env.mesh->stats_dropped;
+       s->mesh_replies_sent = worker->env.mesh->replies_sent;
+       s->mesh_replies_sum_wait = worker->env.mesh->replies_sum_wait;
+       s->mesh_time_median = timehist_quartile(worker->env.mesh->histogram,
+               0.50);
+       if(!worker->env.cfg->stat_cumulative) {
+               server_stats_init(&worker->stats);
+               mesh_stats_clear(worker->env.mesh);
+       }
+}
+
+void server_stats_obtain(struct worker* worker, struct worker* who,
+       struct stats_info* s)
+{
+       uint8_t *reply = NULL;
+       size_t len = 0;
+       if(worker == who) {
+               /* just fill it in */
+               server_stats_compile(worker, s);
+               return;
+       }
+       /* communicate over tube */
+       verbose(VERB_ALGO, "write stats cmd");
+       worker_send_cmd(who, worker_cmd_stats);
+       verbose(VERB_ALGO, "wait for stats reply");
+       if(!tube_read_msg(worker->cmd, &reply, &len, 0))
+               fatal_exit("failed to read stats over cmd channel");
+       if(len != sizeof(*s))
+               fatal_exit("stats on cmd channel wrong length %d %d",
+                       (int)len, (int)sizeof(*s));
+       memcpy(s, reply, len);
+       free(reply);
+}
+
+void server_stats_reply(struct worker* worker)
+{
+       struct stats_info s;
+       server_stats_compile(worker, &s);
+       verbose(VERB_ALGO, "write stats replymsg");
+       if(!tube_write_msg(worker->daemon->workers[0]->cmd, 
+               (uint8_t*)&s, sizeof(s), 0))
+               fatal_exit("could not write stat values over cmd channel");
+}
+
+void server_stats_add(struct stats_info* total, struct stats_info* a)
+{
+       total->svr.num_queries += a->svr.num_queries;
+       total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
+       total->svr.sum_query_list_size += a->svr.sum_query_list_size;
+       /* the max size reached is upped to higher of both */
+       if(a->svr.max_query_list_size > total->svr.max_query_list_size)
+               total->svr.max_query_list_size = a->svr.max_query_list_size;
+
+       total->mesh_num_states += a->mesh_num_states;
+       total->mesh_num_reply_states += a->mesh_num_reply_states;
+       total->mesh_jostled += a->mesh_jostled;
+       total->mesh_dropped += a->mesh_dropped;
+       total->mesh_replies_sent += a->mesh_replies_sent;
+       timeval_add(&total->mesh_replies_sum_wait, &a->mesh_replies_sum_wait);
+       /* the medians are averaged together, this is not as accurate as
+        * taking the median over all of the data, but is good and fast
+        * added up here, division later*/
+       total->mesh_time_median += a->mesh_time_median;
 }
index a3f2bc674b71c257e885e0b4b30a985f732a794b..cc0f6d91d68923e946ab1ecee9581c08200f13cf 100644 (file)
@@ -58,8 +58,30 @@ struct server_stats {
        size_t sum_query_list_size;
        /** max value of query list size reached. */
        size_t max_query_list_size;
-       /** number of times that the query_list_size was insufficient */
-       size_t num_query_list_exceeded;
+};
+
+/** 
+ * Statistics to send over the control pipe when asked
+ * This struct is made to be memcpied, sent in binary.
+ */
+struct stats_info {
+       /** the thread stats */
+       struct server_stats svr;
+
+       /** mesh stats: current number of states */
+       size_t mesh_num_states;
+       /** mesh stats: current number of reply (user) states */
+       size_t mesh_num_reply_states;
+       /** mesh stats: number of reply states overwritten with a new one */
+       size_t mesh_jostled;
+       /** mesh stats: number of incoming queries dropped */
+       size_t mesh_dropped;
+       /** mesh stats: replies sent */
+       size_t mesh_replies_sent;
+       /** mesh stats: sum of waiting times for the replies */
+       struct timeval mesh_replies_sum_wait;
+       /** mesh stats: median of waiting times for replies (in sec) */
+       double mesh_time_median;
 };
 
 /** 
@@ -72,6 +94,37 @@ void server_stats_init(struct server_stats* stats);
 void server_stats_querymiss(struct server_stats* stats, struct worker* worker);
 
 /** display the stats to the log */
-void server_stats_log(struct server_stats* stats, int threadnum);
+void server_stats_log(struct server_stats* stats, struct worker* worker,
+       int threadnum);
+
+/**
+ * Obtain the stats info for a given thread. Uses pipe to communicate.
+ * @param worker: the worker that is executing (the first worker).
+ * @param who: on who to get the statistics info.
+ * @param s: the stats block to fill in.
+ */
+void server_stats_obtain(struct worker* worker, struct worker* who,
+       struct stats_info* s);
+
+/**
+ * Compile stats into structure for this thread worker.
+ * Also clears the statistics counters (if that is set by config file).
+ * @param worker: the worker to compile stats for, also the executing worker.
+ * @param s: stats block.
+ */
+void server_stats_compile(struct worker* worker, struct stats_info* s);
+
+/**
+ * Send stats over comm tube in reply to query cmd
+ * @param worker: this worker.
+ */
+void server_stats_reply(struct worker* worker);
+
+/**
+ * Addup stat blocks.
+ * @param total: sum of the two entries.
+ * @param a: to add to it.
+ */
+void server_stats_add(struct stats_info* total, struct stats_info* a);
 
 #endif /* DAEMON_STATS_H */
index 83e0462ca92a90c624d10b97255e1e4320cf319e..6ca3bca2cb711d374eafab5c1cc8cd24e7630513 100644 (file)
@@ -194,7 +194,7 @@ worker_mem_report(struct worker* ATTR_UNUSED(worker),
 void 
 worker_send_cmd(struct worker* worker, enum worker_commands cmd)
 {
-       uint32_t c = (uint32_t)cmd;
+       uint32_t c = (uint32_t)htonl(cmd);
        if(!tube_write_msg(worker->cmd, (uint8_t*)&c, sizeof(c), 0)) {
                log_err("worker send cmd %d failed", (int)cmd);
        }
@@ -338,6 +338,10 @@ worker_handle_control_cmd(struct tube* ATTR_UNUSED(tube), uint8_t* msg,
                verbose(VERB_ALGO, "got control cmd quit");
                comm_base_exit(worker->base);
                break;
+       case worker_cmd_stats:
+               verbose(VERB_ALGO, "got control cmd stats");
+               server_stats_reply(worker);
+               break;
        default:
                log_err("bad command %d", (int)cmd);
                break;
@@ -831,7 +835,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
                /* protect our memory usage from storing reply addresses */
                verbose(VERB_ALGO, "Too many requests queued. "
                        "dropping incoming query.");
-               worker->stats.num_query_list_exceeded++;
+               worker->env.mesh->stats_dropped++;
                comm_point_drop_reply(repinfo);
                return 0;
        }
@@ -895,7 +899,7 @@ worker_restart_timer(struct worker* worker)
 void worker_stat_timer_cb(void* arg)
 {
        struct worker* worker = (struct worker*)arg;
-       server_stats_log(&worker->stats, worker->thread_num);
+       server_stats_log(&worker->stats, worker, worker->thread_num);
        mesh_stats(worker->env.mesh, "mesh has");
        worker_mem_report(worker, NULL);
        if(!worker->daemon->cfg->stat_cumulative) {
@@ -921,12 +925,10 @@ worker_create(struct daemon* daemon, int id, int* ports, int n)
        }
        worker->daemon = daemon;
        worker->thread_num = id;
-       if(id != 0) {
-               if(!(worker->cmd = tube_create())) {
-                       free(worker->ports);
-                       free(worker);
-                       return NULL;
-               }
+       if(!(worker->cmd = tube_create())) {
+               free(worker->ports);
+               free(worker);
+               return NULL;
        }
        return worker;
 }
@@ -1011,14 +1013,12 @@ worker_init(struct worker* worker, struct config_file *cfg,
                worker_delete(worker);
                return 0;
        }
-       if(worker->thread_num != 0) {
-               /* start listening to commands */
-               if(!tube_setup_bg_listen(worker->cmd, worker->base,
-                       &worker_handle_control_cmd, worker)) {
-                       log_err("could not create control compt.");
-                       worker_delete(worker);
-                       return 0;
-               }
+       /* start listening to commands */
+       if(!tube_setup_bg_listen(worker->cmd, worker->base,
+               &worker_handle_control_cmd, worker)) {
+               log_err("could not create control compt.");
+               worker_delete(worker);
+               return 0;
        }
        worker->stat_timer = comm_timer_create(worker->base, 
                worker_stat_timer_cb, worker);
@@ -1082,7 +1082,7 @@ worker_delete(struct worker* worker)
        if(!worker) 
                return;
        if(worker->env.mesh && verbosity >= VERB_OPS) {
-               server_stats_log(&worker->stats, worker->thread_num);
+               server_stats_log(&worker->stats, worker, worker->thread_num);
                mesh_stats(worker->env.mesh, "mesh has");
                worker_mem_report(worker, NULL);
        }
index 25801d16a78b0cfb7fb7bb18f86d4b6cb101f2b3..6beb607f827c13f191f8593b1acacb6fa50b5feb 100644 (file)
@@ -64,7 +64,9 @@ struct daemon_remote;
 /** worker commands */
 enum worker_commands {
        /** make the worker quit */
-       worker_cmd_quit
+       worker_cmd_quit,
+       /** obtain statistics */
+       worker_cmd_stats
 };
 
 /**
index f14071c433169bd3d122635f68a1c89eb16a14a8..e09d766f09a3cc1f4d23e8fb8c72c4a367a830eb 100644 (file)
@@ -1,6 +1,8 @@
 15 September 2008: Wouter
        - working start, stop, reload commands for unbound-control.
        - test for unbound-control working; better exit value for control.
+       - verbosity control via unbound-control.
+       - unbound-control stats.
 
 12 September 2008: Wouter
        - removed browser control mentions. Proto speccy.
index cf9078622cdc6d6b994cd409dc3bef610ba689ef..1ad8d1cf999ca0daa0cee89811ba714c2279c51f 100644 (file)
@@ -25,6 +25,8 @@ stop
        stops the server.
 reload
        reloads the config file, and flushes the cache.
+verbosity <new value>
+       Change logging verbosity to new value.
 stats
        output is a list of [name]=[value] lines.
        clears the counters.
@@ -61,7 +63,5 @@ local_data <resource record string>
        local_data_add www.example.com. IN A 192.0.2.2
        if no local_zone exists for it; a transparent zone with the same
        name as the data is created.
-verbosity <new value>
-       Change logging verbosity to new value.
 
 
index d4ed17195fcb122347dd7bd858e3eef25f020b0c..92e71865b694fe38d65a42b28760a85dc7e49f13 100644 (file)
@@ -50,6 +50,16 @@ Stop the server. The server daemon exits.
 .TP
 .B reload
 Reload the server. This flushes the cache and reads the config file fresh.
+.TP
+.B verbosity \fInumber
+Change verbosity value for logging. Same values as \fBverbosity\fR keyword in
+\fIunbound.conf\fR(5).  This new setting lasts until the server is issued
+a reload (taken from config file again), or the next verbosity control command.
+.TP
+.B stats
+Print statistics. Resets the internal counters to zero, this can be 
+controlled using the \fBstatistics\-cumulative\fR config statement. 
+Statistics are printed with one [name]: [value] per line.
 .SH "EXIT CODE"
 The unbound-control program exits with status code 1 on error.
 .SH "SET UP"
index af34cb78c5f6e24bd6868186452b922b432cd680..b50410f73a8ab485b14cbafce0b39f97907b6b7f 100644 (file)
@@ -61,6 +61,8 @@ usage()
        printf("  start         start server; runs unbound(8)\n");
        printf("  stop          stops the server\n");
        printf("  reload        reloads the server\n");
+       printf("  stats         print statistics\n");
+       printf("  verbosity [number]    change logging detail\n");
        printf("Version %s\n", PACKAGE_VERSION);
        printf("BSD licensed, see LICENSE in source package for details.\n");
        printf("Report bugs to %s\n", PACKAGE_BUGREPORT);
@@ -118,6 +120,14 @@ contact_server(char* svr, struct config_file* cfg)
                if(cfg->control_ifs)
                        svr = cfg->control_ifs->str;
                else    svr = "127.0.0.1";
+               /* config 0 addr (everything), means ask localhost */
+               if(strcmp(svr, "0.0.0.0") == 0)
+                       svr = "127.0.0.1";
+               else if(strcmp(svr, "::0") == 0 ||
+                       strcmp(svr, "0::0") == 0 ||
+                       strcmp(svr, "0::") == 0 ||
+                       strcmp(svr, "::") == 0)
+                       svr = "::1";
        }
        if(strchr(svr, '@')) {
                if(!extstrtoaddr(svr, &addr, &addrlen))