From 46bdbc7e6711e46d279b7445d5252ecff0e56a91 Mon Sep 17 00:00:00 2001 From: "W.C.A. Wijngaards" Date: Thu, 21 Mar 2024 17:20:46 +0100 Subject: [PATCH] - fast-reload, fix to poll every thread with nopause to make certain that resources are not held by the threads and can be deleted. --- daemon/remote.c | 50 ++++++++++++++++++++++++++++++++++++++++ daemon/remote.h | 5 +++- daemon/worker.c | 28 ++++++++++++++++++---- daemon/worker.h | 4 +++- doc/unbound-control.8.in | 5 +++- 5 files changed, 85 insertions(+), 7 deletions(-) diff --git a/daemon/remote.c b/daemon/remote.c index ba4c8d26d..2b77d8763 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -3564,6 +3564,8 @@ fr_notification_to_string(enum fast_reload_notification status) return "reload_stop"; case fast_reload_notification_reload_ack: return "reload_ack"; + case fast_reload_notification_reload_nopause_poll: + return "reload_nopause_poll"; case fast_reload_notification_reload_start: return "reload_start"; default: @@ -4331,6 +4333,27 @@ fr_load_config(struct fast_reload_thread* fr, struct timeval* time_read, log_err("gettimeofday: %s", strerror(errno)); /* Delete old. */ + if(fr_poll_for_quit(fr)) { + config_delete(newcfg); + fr_construct_clear(&ct); + return 1; + } + if(fr->fr_nopause) { + /* Poll every thread, with a no-work poll item over the + * command pipe. This makes the worker thread surely move + * to deal with that event, and thus the thread is no longer + * holding, eg. a string item from the old config struct. + * And then the old config struct can safely be deleted. + * Only needed when nopause is used, because without that + * the worker threads are already waiting on a command pipe + * item. This nopause command pipe item does not take work, + * it returns immediately, so it does not delay the workers. + * They can be polled one at a time. But its processing causes + * the worker to have released data items from old config. */ + fr_send_notification(fr, + fast_reload_notification_reload_nopause_poll); + fr_poll_for_ack(fr); + } config_delete(newcfg); fr_construct_clear(&ct); return 1; @@ -5022,6 +5045,31 @@ fr_main_perform_reload_stop(struct fast_reload_thread* fr) verbose(VERB_ALGO, "worker resume after reload"); } +/** Fast reload, the main thread performs the nopause poll. It polls every + * other worker thread briefly over the command pipe ipc. The command takes + * no time for the worker, it can return immediately. After that it sends + * an acknowledgement to the fastreload thread. */ +static void +fr_main_perform_reload_nopause_poll(struct fast_reload_thread* fr) +{ + struct daemon* daemon = fr->worker->daemon; + int i; + + /* Send the reload_poll to other threads. They can respond + * one at a time. */ + for(i=0; inum; i++) { + if(i == fr->worker->thread_num) + continue; /* Do not send to ourselves. */ + worker_send_cmd(daemon->workers[i], worker_cmd_reload_poll); + } + + /* Wait for the other threads to ack. */ + fr_read_ack_from_workers(fr); + + /* Send ack to fast reload thread. */ + fr_send_cmd_to(fr, fast_reload_notification_reload_ack, 0, 1); +} + /** Fast reload, perform the command received from the fast reload thread */ static void fr_main_perform_cmd(struct fast_reload_thread* fr, @@ -5037,6 +5085,8 @@ fr_main_perform_cmd(struct fast_reload_thread* fr, fr_main_perform_done(fr); } else if(status == fast_reload_notification_reload_stop) { fr_main_perform_reload_stop(fr); + } else if(status == fast_reload_notification_reload_nopause_poll) { + fr_main_perform_reload_nopause_poll(fr); } else { log_err("main received unknown status from fast reload: %d %s", (int)status, fr_notification_to_string(status)); diff --git a/daemon/remote.h b/daemon/remote.h index 180fabed8..4c4f442f4 100644 --- a/daemon/remote.h +++ b/daemon/remote.h @@ -143,7 +143,10 @@ enum fast_reload_notification { /** ack the stop as part of the reload */ fast_reload_notification_reload_ack = 7, /** resume from stop as part of the reload */ - fast_reload_notification_reload_start = 8 + fast_reload_notification_reload_start = 8, + /** the fast reload thread wants the mainthread to poll workers, + * after the reload, sent when nopause is used */ + fast_reload_notification_reload_nopause_poll = 9 }; /** diff --git a/daemon/worker.c b/daemon/worker.c index be8c81302..1738137e4 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -369,14 +369,21 @@ worker_check_request(sldns_buffer* pkt, struct worker* worker, return; } -/** stop and wait to resume the worker */ +/** + * Send fast-reload acknowledgement to the mainthread in one byte. + * This signals that this works has received the previous command. + * The worker is waiting if that is after a reload_stop command. + * Or the worker has briefly processed the event itself, and in doing so + * released data pointers to old config, after a reload_poll command. + */ static void -worker_stop_and_wait(struct worker* worker) +worker_send_reload_ack(struct worker* worker) { + /* If this is clipped to 8 bits because thread_num>255, then that + * is not a problem, the receiver counts the number of bytes received. + * The number is informative only. */ uint8_t c = (uint8_t)worker->thread_num; ssize_t ret; - uint8_t* buf = NULL; - uint32_t len = 0, cmd; while(1) { ret = send(worker->daemon->fast_reload_thread->commreload[1], &c, 1, 0); @@ -400,6 +407,15 @@ worker_stop_and_wait(struct worker* worker) } break; } +} + +/** stop and wait to resume the worker */ +static void +worker_stop_and_wait(struct worker* worker) +{ + uint8_t* buf = NULL; + uint32_t len = 0, cmd; + worker_send_reload_ack(worker); /* wait for reload */ if(!tube_read_msg(worker->cmd, &buf, &len, 0)) { log_err("worker reload read reply failed"); @@ -468,6 +484,10 @@ worker_handle_control_cmd(struct tube* ATTR_UNUSED(tube), uint8_t* msg, verbose(VERB_ALGO, "got control cmd reload_stop"); worker_stop_and_wait(worker); break; + case worker_cmd_reload_poll: + verbose(VERB_ALGO, "got control cmd reload_poll"); + worker_send_reload_ack(worker); + break; default: log_err("bad command %d", (int)cmd); break; diff --git a/daemon/worker.h b/daemon/worker.h index 84e851d06..b7bb52fd7 100644 --- a/daemon/worker.h +++ b/daemon/worker.h @@ -76,7 +76,9 @@ enum worker_commands { /** for fast-reload, perform stop */ worker_cmd_reload_stop, /** for fast-reload, start again */ - worker_cmd_reload_start + worker_cmd_reload_start, + /** for fast-reload, poll to make sure worker has released data */ + worker_cmd_reload_poll }; /** diff --git a/doc/unbound-control.8.in b/doc/unbound-control.8.in index e67a628d8..a3b10260d 100644 --- a/doc/unbound-control.8.in +++ b/doc/unbound-control.8.in @@ -84,7 +84,10 @@ updated at the same time, so that they are viewed consistently, either old or new values together. The option makes the reload time take eg. 3 microseconds instead of 0.3 milliseconds during which the worker threads are interrupted. So, the interruption is much shorter, at the expense of some -inconsistency. +inconsistency. After the reload itself, every worker thread is briefly +contacted to make them release resources, this makes the delete timing +a little longer, and takes up time from the remote control servicing +worker thread. .IP The '+d' option makes the reload drop queries that the worker threads are working on. This is like flush_requestlist. Without it the queries are kept -- 2.47.2