From: Wouter Wijngaards Date: Mon, 30 Mar 2026 14:13:11 +0000 (+0200) Subject: Fix shared memory stats with threads (#1408) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a2ace114de5accc5b635c0fa09651777c4d2a850;p=thirdparty%2Funbound.git Fix shared memory stats with threads (#1408) * - stats-shm-volley, with mesh_time_median the additions add up to the correct average that is used. * - stats-shm-volley, the stat interval is selected with offset. * - stats-shm-volley, stat totals in separate struct. The first thread zeroes it, and the last thread copies it. * - stats-shm-volley, the array is inited for a new round if one or more * - stats-shm-volley, the array is inited for a new round if one or more threads are not responsive for stat collection. * - stats-shm-volley review, typos and slightly more detailed text for comments. --------- Co-authored-by: Yorgos Thessalonikefs --- diff --git a/daemon/daemon.c b/daemon/daemon.c index 2ae7d6d05..23e9493a0 100644 --- a/daemon/daemon.c +++ b/daemon/daemon.c @@ -1076,6 +1076,12 @@ daemon_fork(struct daemon* daemon) * the thread_start() procedure. */ set_log_thread_id(daemon->workers[0], daemon->cfg); + /* If shm stats need an offset, calculate it */ + if(daemon->cfg->shm_enable && daemon->cfg->stat_interval > 0) { + daemon->stat_time_specific = 1; + daemon->stat_time_offset = + ((int)time(NULL))%daemon->cfg->stat_interval; + } #if defined(HAVE_EV_LOOP) || defined(HAVE_EV_DEFAULT_LOOP) /* in libev the first inited base gets signals */ diff --git a/daemon/daemon.h b/daemon/daemon.h index 2be8759a4..20386d7fc 100644 --- a/daemon/daemon.h +++ b/daemon/daemon.h @@ -155,7 +155,14 @@ struct daemon { /** the dnstap environment master value, copied and changed by threads*/ struct dt_env* dtenv; #endif + /** The SHM info for shared memory stats. */ struct shm_main_info* shm_info; + /** if the timeout for statistics is attempted at specific offset. + * If it is true, the stat timeout is the interval+offset, and that + * picks (roughly) the same time offset every time period. */ + int stat_time_specific; + /** if the timeout is specific, what offset in the period. */ + int stat_time_offset; /** some response-ip tags or actions are configured if true */ int use_response_ip; /** some RPZ policies are configured */ diff --git a/daemon/worker.c b/daemon/worker.c index 026abfcbc..597af56a9 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -2124,10 +2124,37 @@ worker_restart_timer(struct worker* worker) { if(worker->env.cfg->stat_interval > 0) { struct timeval tv; + if(worker->daemon->stat_time_specific) { + struct timeval dest, now; + int interval = worker->env.cfg->stat_interval; + int offset = worker->daemon->stat_time_offset; + int nows, spec; + if(gettimeofday(&now, NULL) < 0) + log_err("gettimeofday: %s", strerror(errno)); #ifndef S_SPLINT_S - tv.tv_sec = worker->env.cfg->stat_interval; - tv.tv_usec = 0; + nows = (int)now.tv_sec; + /* The next time is on the timer interval, at the + * specific offset, time value % interval = offset. */ + /* It relies on the integer division below to drop the + * remainder in order to calculate the expected + * result. */ + spec = ((nows-offset)/interval+1)*interval+offset; + /* This is instead of an assertion, and should not + * be needed. So assert(spec > nows), tv is going to + * be positive. */ + if(spec<=nows) spec += interval; + dest.tv_sec = spec; + dest.tv_usec = 0; #endif + /* Subtract in timeval, so the fractions of a second + * are rounded to the whole specific time. */ + timeval_subtract(&tv, &dest, &now); + } else { +#ifndef S_SPLINT_S + tv.tv_sec = worker->env.cfg->stat_interval; + tv.tv_usec = 0; +#endif + } comm_timer_set(worker->stat_timer, &tv); } } diff --git a/util/shm_side/shm_main.c b/util/shm_side/shm_main.c index 751d6d649..420adbbda 100644 --- a/util/shm_side/shm_main.c +++ b/util/shm_side/shm_main.c @@ -185,6 +185,16 @@ int shm_main_init(struct daemon* daemon) shm_stat = daemon->shm_info->ptr_ctl; shm_stat->num_threads = daemon->num; + lock_basic_init(&daemon->shm_info->lock); + daemon->shm_info->volley_in_progress = 0; + daemon->shm_info->thread_volley = (int*)calloc( + daemon->num, sizeof(int)); + if(!daemon->shm_info->thread_volley) { + log_err("shm fail: malloc failure"); + free(daemon->shm_info); + daemon->shm_info = NULL; + return 0; + } #else (void)daemon; #endif /* HAVE_SHMGET */ @@ -214,6 +224,9 @@ void shm_main_shutdown(struct daemon* daemon) if (daemon->shm_info->ptr_arr) shmdt(daemon->shm_info->ptr_arr); + lock_basic_destroy(&daemon->shm_info->lock); + free(daemon->shm_info->thread_volley); + free(daemon->shm_info); daemon->shm_info = NULL; #else @@ -221,87 +234,161 @@ void shm_main_shutdown(struct daemon* daemon) #endif /* HAVE_SHMGET */ } -void shm_main_run(struct worker *worker) +/** Copy general info into the stat structure. */ +static void +shm_general_info(struct worker* worker) { -#ifdef HAVE_SHMGET struct ub_shm_stat_info *shm_stat; - struct ub_stats_info *stat_total; - struct ub_stats_info *stat_info; - int offset; - -#ifndef S_SPLINT_S - verbose(VERB_DETAIL, "SHM run - worker [%d] - daemon [%p] - timenow(%u) - timeboot(%u)", - worker->thread_num, worker->daemon, (unsigned)worker->env.now_tv->tv_sec, (unsigned)worker->daemon->time_boot.tv_sec); -#endif - - offset = worker->thread_num + 1; - stat_total = worker->daemon->shm_info->ptr_arr; - stat_info = worker->daemon->shm_info->ptr_arr + offset; - - /* Copy data to the current position */ - server_stats_compile(worker, stat_info, 0); - - /* First thread, zero fill total, and copy general info */ - if (worker->thread_num == 0) { - - /* Copy data to the current position */ - memset(stat_total, 0, sizeof(struct ub_stats_info)); - - /* Point to data into SHM */ + /* Point to data into SHM */ #ifndef S_SPLINT_S - shm_stat = worker->daemon->shm_info->ptr_ctl; - shm_stat->time.now_sec = (long long)worker->env.now_tv->tv_sec; - shm_stat->time.now_usec = (long long)worker->env.now_tv->tv_usec; + shm_stat = worker->daemon->shm_info->ptr_ctl; + shm_stat->time.now_sec = (long long)worker->env.now_tv->tv_sec; + shm_stat->time.now_usec = (long long)worker->env.now_tv->tv_usec; #endif - stat_timeval_subtract(&shm_stat->time.up_sec, &shm_stat->time.up_usec, worker->env.now_tv, &worker->daemon->time_boot); - stat_timeval_subtract(&shm_stat->time.elapsed_sec, &shm_stat->time.elapsed_usec, worker->env.now_tv, &worker->daemon->time_last_stat); + stat_timeval_subtract(&shm_stat->time.up_sec, &shm_stat->time.up_usec, worker->env.now_tv, &worker->daemon->time_boot); + stat_timeval_subtract(&shm_stat->time.elapsed_sec, &shm_stat->time.elapsed_usec, worker->env.now_tv, &worker->daemon->time_last_stat); - shm_stat->mem.msg = (long long)slabhash_get_mem(worker->env.msg_cache); - shm_stat->mem.rrset = (long long)slabhash_get_mem(&worker->env.rrset_cache->table); - shm_stat->mem.dnscrypt_shared_secret = 0; + shm_stat->mem.msg = (long long)slabhash_get_mem(worker->env.msg_cache); + shm_stat->mem.rrset = (long long)slabhash_get_mem(&worker->env.rrset_cache->table); + shm_stat->mem.dnscrypt_shared_secret = 0; #ifdef USE_DNSCRYPT - if(worker->daemon->dnscenv) { - shm_stat->mem.dnscrypt_shared_secret = (long long)slabhash_get_mem( - worker->daemon->dnscenv->shared_secrets_cache); - shm_stat->mem.dnscrypt_nonce = (long long)slabhash_get_mem( - worker->daemon->dnscenv->nonces_cache); - } + if(worker->daemon->dnscenv) { + shm_stat->mem.dnscrypt_shared_secret = (long long)slabhash_get_mem( + worker->daemon->dnscenv->shared_secrets_cache); + shm_stat->mem.dnscrypt_nonce = (long long)slabhash_get_mem( + worker->daemon->dnscenv->nonces_cache); + } #endif - shm_stat->mem.val = (long long)mod_get_mem(&worker->env, - "validator"); - shm_stat->mem.iter = (long long)mod_get_mem(&worker->env, - "iterator"); - shm_stat->mem.respip = (long long)mod_get_mem(&worker->env, - "respip"); - - /* subnet mem value is available in shm, also when not enabled, - * to make the struct easier to memmap by other applications, - * independent of the configuration of unbound */ - shm_stat->mem.subnet = 0; + shm_stat->mem.val = (long long)mod_get_mem(&worker->env, "validator"); + shm_stat->mem.iter = (long long)mod_get_mem(&worker->env, "iterator"); + shm_stat->mem.respip = (long long)mod_get_mem(&worker->env, "respip"); + + /* subnet mem value is available in shm, also when not enabled, + * to make the struct easier to memmap by other applications, + * independent of the configuration of unbound */ + shm_stat->mem.subnet = 0; #ifdef CLIENT_SUBNET - shm_stat->mem.subnet = (long long)mod_get_mem(&worker->env, - "subnetcache"); + shm_stat->mem.subnet = (long long)mod_get_mem(&worker->env, + "subnetcache"); #endif - /* ipsecmod mem value is available in shm, also when not enabled, - * to make the struct easier to memmap by other applications, - * independent of the configuration of unbound */ - shm_stat->mem.ipsecmod = 0; + /* ipsecmod mem value is available in shm, also when not enabled, + * to make the struct easier to memmap by other applications, + * independent of the configuration of unbound */ + shm_stat->mem.ipsecmod = 0; #ifdef USE_IPSECMOD - shm_stat->mem.ipsecmod = (long long)mod_get_mem(&worker->env, - "ipsecmod"); + shm_stat->mem.ipsecmod = (long long)mod_get_mem(&worker->env, + "ipsecmod"); #endif #ifdef WITH_DYNLIBMODULE - shm_stat->mem.dynlib = (long long)mod_get_mem(&worker->env, - "dynlib"); + shm_stat->mem.dynlib = (long long)mod_get_mem(&worker->env, "dynlib"); #endif +} + +/** See if the thread is first. Caller has lock. */ +static int +shm_thread_is_first(struct shm_main_info* shm_info, int thread_num, + struct daemon* daemon) +{ + /* The usual method, all threads executed last time, and there + * is no statistics callback in progress. */ + if(!shm_info->volley_in_progress) + return 1; + /* See if we are already active, if so, the timer seems to have fired + * twice for this thread which means other thread(s) have not gone + * through their stats callbacks yet. + * (There should have been a last thread to reset + * shm_info->volley_in_progress and shm_info->thread_volley) + * The other thread(s) are not yet active during this statistics round, + * so this thread must be the first of this new round disregarding the + * other busy thread(s). + * When the other thread(s) have time again, they will process their + * stats callback and hopefully properly end a stats round where all + * threads got to calculate their statistics. */ + if(shm_info->thread_volley[thread_num] != 0) { + /* The new round starts and zeroes the total. The previous + * partial total is discarded. That means while other thread(s) + * are performing a long task, eg. loading a large zone + * perhaps, the total is not updated and stays the same in the + * shared memory area. Once that other thread(s) perform the + * statistic callback again, the total is updated again. + * + * The threads busy with long tasks have 0 in the array. + * The array is inited for a new round. */ + memset(shm_info->thread_volley, 0, + ((size_t)daemon->num) * sizeof(int)); + return 1; } + return 0; +} - server_stats_add(stat_total, stat_info); +/** See if the thread is last. Caller has lock. */ +static int +shm_thread_is_last(struct daemon* daemon) +{ + /* Being last means that all threads have been active for this stats + * round and this thread is the last one; also active. All the + * thread_volley values should be true then. */ + int i; + for(i=0; inum; i++) { + if(!daemon->shm_info->thread_volley[i]) + return 0; + } + return 1; +} - /* print the thread statistics */ - stat_total->mesh_time_median /= (double)worker->daemon->num; +void shm_main_run(struct worker *worker) +{ +#ifdef HAVE_SHMGET + struct ub_stats_info *stat_total; + struct ub_stats_info *stat_info; + int offset; + double total_mesh_time_median; + struct shm_main_info* shm_info = worker->daemon->shm_info; +#ifndef S_SPLINT_S + verbose(VERB_DETAIL, "SHM run - worker [%d] - daemon [%p] - timenow(%u) - timeboot(%u)", + worker->thread_num, worker->daemon, (unsigned)worker->env.now_tv->tv_sec, (unsigned)worker->daemon->time_boot.tv_sec); +#endif + + offset = worker->thread_num + 1; + stat_total = shm_info->ptr_arr; + stat_info = shm_info->ptr_arr + offset; + + /* Copy data to the current position */ + server_stats_compile(worker, stat_info, 0); + + /* Lock the lock and see if this thread is first or last of the + * stat threads. It can then zero value or sum up values. */ + lock_basic_lock(&shm_info->lock); + if(shm_thread_is_first(shm_info, worker->thread_num, worker->daemon)) { + /* First thread, zero fill total. */ + memset(&shm_info->total_in_progress, 0, + sizeof(struct ub_stats_info)); + shm_info->volley_in_progress = 1; + } + shm_info->thread_volley[worker->thread_num] = 1; + if(worker->thread_num == 0) { + /* Thread 0, copy general info. */ + shm_general_info(worker); + } + /* Add thread data to the total */ + total_mesh_time_median = shm_info->total_in_progress.mesh_time_median; + server_stats_add(&shm_info->total_in_progress, stat_info); + /* By adding the value/num per stat thread, for the median, + * it is going to add up to the sum/num. */ + shm_info->total_in_progress.mesh_time_median = total_mesh_time_median + + (stat_info->mesh_time_median/(double)worker->daemon->num); + + if(shm_thread_is_last(worker->daemon)) { + /* Copy over the total */ + memcpy(stat_total, &shm_info->total_in_progress, + sizeof(struct ub_stats_info)); + shm_info->volley_in_progress = 0; + memset(shm_info->thread_volley, 0, + ((size_t)worker->daemon->num) * sizeof(int)); + } + lock_basic_unlock(&shm_info->lock); #else (void)worker; #endif /* HAVE_SHMGET */ diff --git a/util/shm_side/shm_main.h b/util/shm_side/shm_main.h index 76c60e484..71a27abbc 100644 --- a/util/shm_side/shm_main.h +++ b/util/shm_side/shm_main.h @@ -47,6 +47,8 @@ struct worker; /* get struct ub_shm_stat_info */ #include "libunbound/unbound.h" +#include "util/locks.h" + /** * The SHM info. */ @@ -59,6 +61,19 @@ struct shm_main_info { int key; int id_ctl; int id_arr; + + /** This mutex is on the volley information. */ + lock_basic_type lock; + /** If there is a volley, a number of stat timer callbacks by the + * threads, in progress. If not, there is no volley in progress and the + * previous stat run has terminated succesfully for all threads. + * Usually activated by the first thread and deactivated by the last + * thread that starts its stat callback. */ + int volley_in_progress; + /** Per thread, if they have put in stats. 0 if not. */ + int* thread_volley; + /** The total stats of the thread stat timers, it is in progress */ + struct ub_stats_info total_in_progress; }; int shm_main_init(struct daemon* daemon);