git.ipfire.org Git - thirdparty/unbound.git/commitdiff
- fast-reload, fix to poll every thread with nopause to make certain that
author: W.C.A. Wijngaards <wouter@nlnetlabs.nl>
Thu, 21 Mar 2024 16:20:46 +0000 (17:20 +0100)
committer: W.C.A. Wijngaards <wouter@nlnetlabs.nl>
Thu, 21 Mar 2024 16:20:46 +0000 (17:20 +0100)
  resources are not held by the threads and can be deleted.

daemon/remote.c
daemon/remote.h
daemon/worker.c
daemon/worker.h
doc/unbound-control.8.in

index ba4c8d26db18fe320577975ba6066392bda25458..2b77d8763cc17bda78494bd65cabb2d54a751455 100644 (file)
@@ -3564,6 +3564,8 @@ fr_notification_to_string(enum fast_reload_notification status)
                return "reload_stop";
        case fast_reload_notification_reload_ack:
                return "reload_ack";
+       case fast_reload_notification_reload_nopause_poll:
+               return "reload_nopause_poll";
        case fast_reload_notification_reload_start:
                return "reload_start";
        default:
@@ -4331,6 +4333,27 @@ fr_load_config(struct fast_reload_thread* fr, struct timeval* time_read,
                log_err("gettimeofday: %s", strerror(errno));
 
        /* Delete old. */
+       if(fr_poll_for_quit(fr)) {
+               config_delete(newcfg);
+               fr_construct_clear(&ct);
+               return 1;
+       }
+       if(fr->fr_nopause) {
+               /* Poll every thread, with a no-work poll item over the
+                * command pipe. This makes the worker thread surely move
+                * to deal with that event, and thus the thread is no longer
+                * holding, eg. a string item from the old config struct.
+                * And then the old config struct can safely be deleted.
+                * Only needed when nopause is used, because without that
+                * the worker threads are already waiting on a command pipe
+                * item. This nopause command pipe item does not take work,
+                * it returns immediately, so it does not delay the workers.
+                * They can be polled one at a time. But its processing causes
+                * the worker to have released data items from old config. */
+               fr_send_notification(fr,
+                       fast_reload_notification_reload_nopause_poll);
+               fr_poll_for_ack(fr);
+       }
        config_delete(newcfg);
        fr_construct_clear(&ct);
        return 1;
@@ -5022,6 +5045,31 @@ fr_main_perform_reload_stop(struct fast_reload_thread* fr)
        verbose(VERB_ALGO, "worker resume after reload");
 }
 
+/** Fast reload, the main thread performs the nopause poll. It polls every
+ * other worker thread briefly over the command pipe ipc. The command takes
+ * no time for the worker, it can return immediately. After that it sends
+ * an acknowledgement to the fastreload thread. */
+static void
+fr_main_perform_reload_nopause_poll(struct fast_reload_thread* fr)
+{
+       struct daemon* daemon = fr->worker->daemon;
+       int i;
+
+       /* Send the reload_poll to other threads. They can respond
+        * one at a time. */
+       for(i=0; i<daemon->num; i++) {
+               if(i == fr->worker->thread_num)
+                       continue; /* Do not send to ourselves. */
+               worker_send_cmd(daemon->workers[i], worker_cmd_reload_poll);
+       }
+
+       /* Wait for the other threads to ack. */
+       fr_read_ack_from_workers(fr);
+
+       /* Send ack to fast reload thread. */
+       fr_send_cmd_to(fr, fast_reload_notification_reload_ack, 0, 1);
+}
+
 /** Fast reload, perform the command received from the fast reload thread */
 static void
 fr_main_perform_cmd(struct fast_reload_thread* fr,
@@ -5037,6 +5085,8 @@ fr_main_perform_cmd(struct fast_reload_thread* fr,
                fr_main_perform_done(fr);
        } else if(status == fast_reload_notification_reload_stop) {
                fr_main_perform_reload_stop(fr);
+       } else if(status == fast_reload_notification_reload_nopause_poll) {
+               fr_main_perform_reload_nopause_poll(fr);
        } else {
                log_err("main received unknown status from fast reload: %d %s",
                        (int)status, fr_notification_to_string(status));
index 180fabed826eeb251ed95a354b271f882e3333f3..4c4f442f44979657efe897d076a334e59cf2a789 100644 (file)
@@ -143,7 +143,10 @@ enum fast_reload_notification {
        /** ack the stop as part of the reload */
        fast_reload_notification_reload_ack = 7,
        /** resume from stop as part of the reload */
-       fast_reload_notification_reload_start = 8
+       fast_reload_notification_reload_start = 8,
+       /** the fast reload thread wants the mainthread to poll workers,
+        * after the reload, sent when nopause is used */
+       fast_reload_notification_reload_nopause_poll = 9
 };
 
 /**
index be8c81302dfbdf04cde80747a89a8cb1aea1e282..1738137e4ae570c1feb158f999c5f9fc31da1bac 100644 (file)
@@ -369,14 +369,21 @@ worker_check_request(sldns_buffer* pkt, struct worker* worker,
        return;
 }
 
-/** stop and wait to resume the worker */
+/**
+ * Send fast-reload acknowledgement to the mainthread in one byte.
+ * This signals that this worker has received the previous command.
+ * The worker is waiting if that is after a reload_stop command.
+ * Or the worker has briefly processed the event itself, and in doing so
+ * released data pointers to old config, after a reload_poll command.
+ */
 static void
-worker_stop_and_wait(struct worker* worker)
+worker_send_reload_ack(struct worker* worker)
 {
+       /* If this is clipped to 8 bits because thread_num>255, then that
+        * is not a problem, the receiver counts the number of bytes received.
+        * The number is informative only. */
        uint8_t c = (uint8_t)worker->thread_num;
        ssize_t ret;
-       uint8_t* buf = NULL;
-       uint32_t len = 0, cmd;
        while(1) {
                ret = send(worker->daemon->fast_reload_thread->commreload[1],
                        &c, 1, 0);
@@ -400,6 +407,15 @@ worker_stop_and_wait(struct worker* worker)
                }
                break;
        }
+}
+
+/** stop and wait to resume the worker */
+static void
+worker_stop_and_wait(struct worker* worker)
+{
+       uint8_t* buf = NULL;
+       uint32_t len = 0, cmd;
+       worker_send_reload_ack(worker);
        /* wait for reload */
        if(!tube_read_msg(worker->cmd, &buf, &len, 0)) {
                log_err("worker reload read reply failed");
@@ -468,6 +484,10 @@ worker_handle_control_cmd(struct tube* ATTR_UNUSED(tube), uint8_t* msg,
                verbose(VERB_ALGO, "got control cmd reload_stop");
                worker_stop_and_wait(worker);
                break;
+       case worker_cmd_reload_poll:
+               verbose(VERB_ALGO, "got control cmd reload_poll");
+               worker_send_reload_ack(worker);
+               break;
        default:
                log_err("bad command %d", (int)cmd);
                break;
index 84e851d06bbf96ef6a1ac2c7165d3e157f80c5a8..b7bb52fd715ba3c42910d7dcf7a6cec467545fcd 100644 (file)
@@ -76,7 +76,9 @@ enum worker_commands {
        /** for fast-reload, perform stop */
        worker_cmd_reload_stop,
        /** for fast-reload, start again */
-       worker_cmd_reload_start
+       worker_cmd_reload_start,
+       /** for fast-reload, poll to make sure worker has released data */
+       worker_cmd_reload_poll
 };
 
 /**
index e67a628d80f1b7557aeef9263a6ebf0f8e3bd94e..a3b10260db2e7a629376fcd48740b794b4053832 100644 (file)
@@ -84,7 +84,10 @@ updated at the same time, so that they are viewed consistently, either old
 or new values together. The option makes the reload time take eg. 3
 microseconds instead of 0.3 milliseconds during which the worker threads are
 interrupted. So, the interruption is much shorter, at the expense of some
-inconsistency.
+inconsistency. After the reload itself, every worker thread is briefly
+contacted to make them release resources, this makes the delete timing
+a little longer, and takes up time from the remote control servicing
+worker thread.
 .IP
 The '+d' option makes the reload drop queries that the worker threads are
 working on. This is like flush_requestlist. Without it the queries are kept