]> git.ipfire.org Git - thirdparty/bird.git/commitdiff
Graceful recovery: converted to obstacles
authorMaria Matejka <mq@ucw.cz>
Mon, 23 Dec 2024 20:06:26 +0000 (21:06 +0100)
committerMaria Matejka <mq@ucw.cz>
Thu, 9 Jan 2025 09:08:27 +0000 (10:08 +0100)
Yet another refcounting mechanism had a locking collision.

nest/proto.c
nest/protocol.h

index 6fa74e9f175e51e60b3d5a528935256e168ebc18..caf99829b5171bed2a38c15e45537b6616144646 100644 (file)
@@ -31,15 +31,8 @@ static list STATIC_LIST_INIT(protocol_list);
 #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); })
 #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); })
 
-static timer *gr_wait_timer;
-
-#define GRS_NONE       0
-#define GRS_INIT       1
-#define GRS_ACTIVE     2
-#define GRS_DONE       3
-
-static int graceful_restart_state;
-static u32 graceful_restart_locks;
+static struct graceful_recovery_context _graceful_recovery_context;
+OBSREF(struct graceful_recovery_context) graceful_recovery_context;
 
 static char *p_states[] = { "DOWN", "START", "UP", "STOP" };
 static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" };
@@ -912,7 +905,7 @@ channel_do_stop(struct channel *c)
   ev_postpone(&c->reimport_event);
 
   c->gr_wait = 0;
-  if (c->gr_lock)
+  if (OBSREF_GET(c->gr_lock))
     channel_graceful_restart_unlock(c);
 
   CALL(c->class->shutdown, c);
@@ -1407,7 +1400,7 @@ proto_start(struct proto *p)
   DBG("Kicking %s up\n", p->name);
   PD(p, "Starting");
 
-  if (graceful_restart_state == GRS_INIT)
+  if (OBSREF_GET(graceful_recovery_context))
     p->gr_recovery = 1;
 
   if (p->cf->loop_order != DOMAIN_ORDER(the_bird))
@@ -1921,7 +1914,45 @@ proto_enable(struct proto *p)
  *
  */
 
-static void graceful_restart_done(timer *t);
+/**
+ * graceful_restart_done - finalize graceful restart
+ * @t: unused
+ *
+ * When there are no locks on graceful restart, the functions finalizes the
+ * graceful restart recovery. Protocols postponing route export until the end of
+ * the recovery are awakened and the export to them is enabled.
+ */
+static void
+graceful_recovery_done(struct callback *_ UNUSED)
+{
+  ASSERT_DIE(birdloop_inside(&main_birdloop));
+  ASSERT_DIE(_graceful_recovery_context.grc_state == GRS_ACTIVE);
+
+  tm_stop(&_graceful_recovery_context.wait_timer);
+  log(L_INFO "Graceful recovery done");
+
+  WALK_TLIST(proto, p, &global_proto_list)
+    PROTO_LOCKED_FROM_MAIN(p)
+    {
+      p->gr_recovery = 0;
+
+      struct channel *c;
+      WALK_LIST(c, p->channels)
+      {
+       ASSERT_DIE(!OBSREF_GET(c->gr_lock));
+
+       /* Resume postponed export of routes */
+       if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
+         channel_start_export(c);
+
+       /* Cleanup */
+       c->gr_wait = 0;
+      }
+    }
+
+  _graceful_recovery_context.grc_state = GRS_DONE;
+}
+
 
 /**
  * graceful_restart_recovery - request initial graceful restart recovery
@@ -1933,7 +1964,30 @@ static void graceful_restart_done(timer *t);
 void
 graceful_restart_recovery(void)
 {
-  graceful_restart_state = GRS_INIT;
+  obstacle_target_init(
+      &_graceful_recovery_context.obstacles,
+      &_graceful_recovery_context.obstacles_cleared,
+      &root_pool, "Graceful recovery");
+
+  OBSREF_SET(graceful_recovery_context, &_graceful_recovery_context);
+  _graceful_recovery_context.grc_state = GRS_INIT;
+}
+
+static void
+graceful_recovery_timeout(timer *t UNUSED)
+{
+  log(L_INFO "Graceful recovery timeout");
+  WALK_TLIST(proto, p, &global_proto_list)
+    PROTO_LOCKED_FROM_MAIN(p)
+    {
+      struct channel *c;
+      WALK_LIST(c, p->channels)
+       if (OBSREF_GET(c->gr_lock))
+       {
+         log(L_INFO "Graceful recovery: Not waiting for %s.%s", p->name, c->name);
+         OBSREF_CLEAR(c->gr_lock);
+       }
+    }
 }
 
 /**
@@ -1946,73 +2000,35 @@ graceful_restart_recovery(void)
 void
 graceful_restart_init(void)
 {
-  if (!graceful_restart_state)
+  if (!OBSREF_GET(graceful_recovery_context))
     return;
 
-  log(L_INFO "Graceful restart started");
+  log(L_INFO "Graceful recovery started");
 
-  if (!graceful_restart_locks)
-  {
-    graceful_restart_done(NULL);
-    return;
-  }
+  _graceful_recovery_context.grc_state = GRS_ACTIVE;
 
-  graceful_restart_state = GRS_ACTIVE;
-  gr_wait_timer = tm_new_init(proto_pool, graceful_restart_done, NULL, 0, 0);
+  _graceful_recovery_context.wait_timer = (timer) { .hook = graceful_recovery_timeout };
   u32 gr_wait = atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait;
-  tm_start(gr_wait_timer, gr_wait S);
-}
-
-/**
- * graceful_restart_done - finalize graceful restart
- * @t: unused
- *
- * When there are no locks on graceful restart, the functions finalizes the
- * graceful restart recovery. Protocols postponing route export until the end of
- * the recovery are awakened and the export to them is enabled. All other
- * related state is cleared. The function is also called when the graceful
- * restart wait timer fires (but there are still some locks).
- */
-static void
-graceful_restart_done(timer *t)
-{
-  log(L_INFO "Graceful restart done");
-  graceful_restart_state = GRS_DONE;
-
-  WALK_TLIST(proto, p, &global_proto_list)
-  {
-    if (!p->gr_recovery)
-      continue;
-
-    struct channel *c;
-    WALK_LIST(c, p->channels)
-    {
-      /* Resume postponed export of routes */
-      if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify)
-       channel_start_export(c);
+  tm_start(&_graceful_recovery_context.wait_timer, gr_wait S);
 
-      /* Cleanup */
-      c->gr_wait = 0;
-      c->gr_lock = 0;
-    }
-
-    p->gr_recovery = 0;
-  }
+  callback_init(&_graceful_recovery_context.obstacles_cleared, graceful_recovery_done, &main_birdloop);
 
-  graceful_restart_locks = 0;
-
-  rfree(t);
+  /* The last clearing of obstacle reference will cause
+   * the graceful recovery finish immediately. */
+  OBSREF_CLEAR(graceful_recovery_context);
 }
 
 void
 graceful_restart_show_status(void)
 {
-  if (graceful_restart_state != GRS_ACTIVE)
+  if (_graceful_recovery_context.grc_state != GRS_ACTIVE)
     return;
 
   cli_msg(-24, "Graceful restart recovery in progress");
-  cli_msg(-24, "  Waiting for %d channels to recover", graceful_restart_locks);
-  cli_msg(-24, "  Wait timer is %t/%u", tm_remains(gr_wait_timer),
+  cli_msg(-24, "  Waiting for %u channels to recover",
+      obstacle_target_count(&_graceful_recovery_context.obstacles));
+  cli_msg(-24, "  Wait timer is %t/%u",
+      tm_remains(&_graceful_recovery_context.wait_timer),
       atomic_load_explicit(&global_runtime, memory_order_relaxed)->gr_wait);
 }
 
@@ -2032,14 +2048,22 @@ graceful_restart_show_status(void)
 void
 channel_graceful_restart_lock(struct channel *c)
 {
-  ASSERT(graceful_restart_state == GRS_INIT);
-  ASSERT(c->proto->gr_recovery);
+  ASSERT_DIE(birdloop_inside(&main_birdloop));
 
-  if (c->gr_lock)
+  if (OBSREF_GET(c->gr_lock))
     return;
 
-  c->gr_lock = 1;
-  graceful_restart_locks++;
+  switch (_graceful_recovery_context.grc_state)
+  {
+    case GRS_INIT:
+    case GRS_ACTIVE:
+      OBSREF_SET(c->gr_lock, &_graceful_recovery_context);
+      break;
+
+    case GRS_NONE:
+    case GRS_DONE:
+      break;
+  }
 }
 
 /**
@@ -2052,18 +2076,10 @@ channel_graceful_restart_lock(struct channel *c)
 void
 channel_graceful_restart_unlock(struct channel *c)
 {
-  if (!c->gr_lock)
-    return;
-
-  c->gr_lock = 0;
-  graceful_restart_locks--;
-
-  if ((graceful_restart_state == GRS_ACTIVE) && !graceful_restart_locks)
-    tm_start(gr_wait_timer, 0);
+  OBSREF_CLEAR(c->gr_lock);
 }
 
 
-
 /**
  * protos_dump_all - dump status of all protocols
  *
@@ -2615,9 +2631,9 @@ channel_show_info(struct channel *c)
   cli_msg(-1006, "    Input filter:   %s", filter_name(c->in_filter));
   cli_msg(-1006, "    Output filter:  %s", filter_name(c->out_filter));
 
-  if (graceful_restart_state == GRS_ACTIVE)
+  if (_graceful_recovery_context.grc_state == GRS_ACTIVE)
     cli_msg(-1006, "    GR recovery:   %s%s",
-           c->gr_lock ? " pending" : "",
+           OBSREF_GET(c->gr_lock) ? " pending" : "",
            c->gr_wait ? " waiting" : "");
 
   channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]);
index 2bfa1628a92bbf38c5f58f70e6b2965f4c415bea..ec561b2636084d30488f51d909b924514ab9bc49 100644 (file)
@@ -659,7 +659,7 @@ struct channel {
 
   u8 channel_state;
   u8 reloadable;                       /* Hook reload_routes() is allowed on the channel */
-  u8 gr_lock;                          /* Graceful restart mechanism should wait for this channel */
+  OBSREF(struct graceful_recovery_context) gr_lock;    /* Graceful restart mechanism should wait for this channel */
   u8 gr_wait;                          /* Route export to channel is postponed until graceful restart */
 
   u32 obstacles;                       /* External obstacles remaining before cleanup */
@@ -763,4 +763,16 @@ void *channel_config_new(const struct channel_class *cc, const char *name, uint
 void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto);
 int channel_reconfigure(struct channel *c, struct channel_config *cf);
 
+struct graceful_recovery_context {
+  struct obstacle_target obstacles;
+  struct callback obstacles_cleared;
+  enum {
+    GRS_NONE,
+    GRS_INIT,
+    GRS_ACTIVE,
+    GRS_DONE,
+  } grc_state;
+  timer wait_timer;
+};
+
 #endif