rework to respect response_window, etc.

author Alan T. DeKok <aland@freeradius.org>

Mon, 23 Aug 2021 19:49:59 +0000 (15:49 -0400)

committer Alan T. DeKok <aland@freeradius.org>

Mon, 23 Aug 2021 19:51:46 +0000 (15:51 -0400)
author Alan T. DeKok <aland@freeradius.org>
Mon, 23 Aug 2021 19:49:59 +0000 (15:49 -0400)
committer Alan T. DeKok <aland@freeradius.org>
Mon, 23 Aug 2021 19:51:46 +0000 (15:51 -0400)
diff --git a/src/modules/rlm_radius/rlm_radius_udp.c b/src/modules/rlm_radius/rlm_radius_udp.c

index b8bee724ef6b146f626ae908eb32f39efe8fda62..dfbb2a30dc144a7cf17ab874d3960276b6896569 100644 (file)
--- a/src/modules/rlm_radius/rlm_radius_udp.c
+++ b/src/modules/rlm_radius/rlm_radius_udp.c
@@ -460,7 +460,7 @@ static void conn_error_status_check(UNUSED fr_event_list_t *el, UNUSED int fd, U
         fr_connection_signal_reconnect(conn, FR_CONNECTION_FAILED);
  }
  
-/** Status check timedout
+/** Status check timer when opening the connection for the first time.
   *
   * Setup retries, or fail the connection.
   */
@@ -1532,15 +1532,52 @@ static int encode(rlm_radius_udp_t const *inst, request_t *request, udp_request_
  /** Revive a connection after "revive_interval"
   *
   */
-static void revive_timer(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void *uctx)
+static void revive_timeout(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void *uctx)
  {
         fr_trunk_connection_t   *tconn = talloc_get_type_abort(uctx, fr_trunk_connection_t);
         udp_handle_t            *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t);
  
-       INFO("%s - Shutting down and reviving connection %s", h->module_name, h->name);
+       INFO("%s - Reviving connection %s", h->module_name, h->name);
         fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
  }
  
+/** Mark a connection dead after "zombie_interval"
+ *
+ */
+static void zombie_timeout(fr_event_list_t *el, fr_time_t now, void *uctx)
+{
+       fr_trunk_connection_t   *tconn = talloc_get_type_abort(uctx, fr_trunk_connection_t);
+       udp_handle_t            *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t);
+
+       INFO("%s - No replies during 'zombie_period', marking connection %s as dead", h->module_name, h->name);
+
+       /*
+        *      Don't use this connection, and re-queue all of its
+        *      requests onto other connections.
+        */
+       fr_trunk_connection_signal_inactive(tconn);
+       (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false);
+
+       /*
+        *      We do have status checks.  Try to reconnect the
+        *      connection immediately.  If the status checks pass,
+        *      then the connection will be marked "alive"
+        */
+       if (h->inst->parent->status_check) {
+               fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+               return;
+       }
+
+       /*
+        *      Revive the connection after a time.
+        */
+       if (fr_event_timer_at(h, el, &h->zombie_ev, now + h->inst->parent->revive_interval, revive_timeout, h) < 0) {
+               ERROR("Failed inserting revive timeout for connection");
+               fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+       }
+}
+
+
  /** See if the connection is zombied.
   *
   *     We check for zombie when major events happen:
@@ -1561,12 +1598,10 @@ static void revive_timer(UNUSED fr_event_list_t *el, UNUSED fr_time_t now, void
   *  for zombie at BOTH the timeout and the mux / write function.
   *
   * @return
- *     - true if a connection state change was triggered.
- *       The connection is likely now a zombie or was reconnected.
- *     - false if the connection did not change state.  It may
- *       still be a zombie, but it was a zombie when this
+ *     - true if the connection is zombie.
+ *     - false if the connection is not zombie.
   */
-static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn, fr_time_t now)
+static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn, fr_time_t now, fr_time_t last_sent)
  {
         udp_handle_t    *h = talloc_get_type_abort(tconn->conn->h, udp_handle_t);
  
@@ -1577,104 +1612,84 @@ static bool check_for_zombie(fr_event_list_t *el, fr_trunk_connection_t *tconn,
         fr_assert(!h->inst->replicate);
  
         /*
-        *      If there's already a zombie check started, don't do
-        *      another one.
-        *
-        *      Or if we never sent a packet, we don't know (or care)
-        *      if the home server is up.
-        *
-        *      Or if we had sent packets, and then went idle.
+        *      If we're status checking OR already zombie, don't go to zombie
          *
-        *      Or we had replies, and then went idle.
-        *
-        *      We do checks for both sent && replied, because we
-        *      could have sent packets without getting replies (and
-        *      then mark it zombie), or we could have gotten some
-        *      replies which then stopped coming back (and then mark
-        *      it zombie).
          */
-       if (h->status_checking || h->zombie_ev || !h->last_sent || (h->last_sent <= h->last_idle) ||
-           (h->last_reply && (h->last_reply <= h->last_idle))) {
-               return false;
-       }
+       if (h->status_checking || h->zombie_ev) return true;
  
         if (now == 0) now = fr_time();
  
         /*
-        *      We've sent a packet since we last went idle, and/or
-        *      we've received replies since we last went idle.
-        *
-        *      If we have a reply, then set the zombie timeout from
-        *      when we received the last reply.
-        *
-        *      If we haven't seen a reply, then set the zombie
-        *      timeout from when we first started sending packets.
+        *      We received a reply since this packet was sent, the connection isn't zombie.
          */
-       if (h->last_reply) {
-               if ((h->last_reply + h->inst->parent->zombie_period) >= now) return false;
-               DEBUG2("%s - We have passed 'zombie_period' time since the last reply on connection %s",
-                      h->module_name, h->name);
-       } else {
-               if ((h->first_sent + h->inst->parent->zombie_period) >= now) return false;
-               DEBUG2("%s - We have passed 'zombie_period' time since we first sent a packet, and "
-                      "there have been no replies on connection %s", h->module_name, h->name);
-       }
+       if (h->last_reply >= last_sent) return false;
  
         /*
-        *      No status checks: this connection is dead.
-        *
-        *      We will requeue this packet on another
-        *      connection.
+        *      If we've seen ANY response in the allowed window, then the connection is still alive.
          */
-       if (!h->inst->parent->status_check) {
-               fr_time_t when;
-
-               WARN("%s - Connection failed.  Reviving it in %pVs", h->module_name,
-                    fr_box_time_delta(h->inst->parent->revive_interval));
-               fr_trunk_connection_signal_inactive(tconn);
-               (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false);
-
-               when = now + h->inst->parent->revive_interval;
-               if (fr_event_timer_at(h, el, &h->zombie_ev, when, revive_timer, tconn) < 0) {
-                       fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
-                       return true;
-               }
-
-               return true;
-       }
+       if (last_sent && ((last_sent + h->inst->parent->response_window) < now)) return false;
  
         /*
          *      Mark the connection as inactive, but keep sending
          *      packets on it.
          */
         WARN("%s - Entering Zombie state - connection %s", h->module_name, h->name);
-       h->status_checking = true;
-
-       /*
-        *      Move ALL requests to other connections!
-        */
         fr_trunk_connection_signal_inactive(tconn);
-       (void) fr_trunk_connection_requests_requeue(tconn, FR_TRUNK_REQUEST_STATE_ALL, 0, false);
  
-       /*
-        *      Queue up the status check packet.  It will be sent
-        *      when the connection is writable.
-        */
-       h->status_u->retry.start = 0;
-       h->status_r->treq = NULL;
+       if (h->inst->parent->status_check) {
+               h->status_checking = true;
  
-       if (fr_trunk_request_enqueue_on_conn(&h->status_r->treq, tconn, h->status_request,
-                                            h->status_u, h->status_r, true) != FR_TRUNK_ENQUEUE_OK) {
-               fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+               /*
+                *      Queue up the status check packet.  It will be sent
+                *      when the connection is writable.
+                */
+               h->status_u->retry.start = 0;
+               h->status_r->treq = NULL;
+
+               if (fr_trunk_request_enqueue_on_conn(&h->status_r->treq, tconn, h->status_request,
+                                                    h->status_u, h->status_r, true) != FR_TRUNK_ENQUEUE_OK) {
+                       fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+               }
+       } else {
+               if (fr_event_timer_at(h, el, &h->zombie_ev, now + h->inst->parent->zombie_period, zombie_timeout, h) < 0) {
+                       ERROR("Failed inserting zombie timeout for connection");
+                       fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+               }
         }
  
         return true;
  }
  
-/** Handle retries for a request_t
+/** Handle timeouts when a request is being sent synchronously
   *
   */
  static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx)
+{
+       fr_trunk_request_t      *treq = talloc_get_type_abort(uctx, fr_trunk_request_t);
+       udp_handle_t            *h;
+       udp_request_t           *u = talloc_get_type_abort(treq->preq, udp_request_t);
+       udp_result_t            *r = talloc_get_type_abort(treq->rctx, udp_result_t);
+       fr_trunk_connection_t   *tconn = treq->tconn;
+
+       fr_assert(treq->state == FR_TRUNK_REQUEST_STATE_SENT);          /* No other states should be timing out */
+       fr_assert(treq->preq);                                          /* Must still have a protocol request */
+       fr_assert(u->rr);
+       fr_assert(tconn);
+
+       h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t);
+
+       r->rcode = RLM_MODULE_FAIL;
+       fr_trunk_request_signal_complete(treq);
+
+       fr_assert(!u->status_check);
+
+       check_for_zombie(el, tconn, now, u->retry.start);
+}
+
+/** Handle retries when a request is being sent asynchronously
+ *
+ */
+static void request_retry(UNUSED fr_event_list_t *el, fr_time_t now, void *uctx)
  {
         fr_trunk_request_t      *treq = talloc_get_type_abort(uctx, fr_trunk_request_t);
         udp_handle_t            *h;
@@ -1690,28 +1705,56 @@ static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx)
  
         h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t);
  
-       if (!u->status_check) {
-               /*
-                *      If the connection just became a zombie
-                *      the request that just timedout will
-                *      have moved back into the trunk backlog,
-                *      been assigned to another connection
-                *      or freed.
-                *
-                *      In any case we must not continue to
-                *      work with it, because we have no idea
-                *      what state its in.
-                */
-               if (check_for_zombie(el, tconn, now)) return;
+       fr_assert(!u->status_check);
  
-       } else {
-               /*
-                *      Reset replies to 0 as we only count
-                *      contiguous, good, replies.
-                */
-               u->num_replies = 0;
+       switch (fr_retry_next(&u->retry, now)) {
+       /*
+        *      Queue the request for retransmission.
+        *
+        *      @todo - set up "next" timer here, instead of in
+        *      request_mux() ?  That way we can catch the case of
+        *      packets sitting in the queue for extended periods of
+        *      time, and still run the timers.
+        */
+       case FR_RETRY_CONTINUE:
+               fr_trunk_request_requeue(treq);
+               return;
+
+       case FR_RETRY_MRD:
+               REDEBUG("Reached maximum_retransmit_duration (%pVs > %pVs), failing request",
+                       fr_box_time_delta(now - u->retry.start), fr_box_time_delta(u->retry.config->mrd));
+               break;
+
+       case FR_RETRY_MRC:
+               REDEBUG("Reached maximum_retransmit_count (%u > %u), failing request",
+                       u->retry.count, u->retry.config->mrc);
+               break;
         }
  
+       r->rcode = RLM_MODULE_FAIL;
+       fr_trunk_request_signal_complete(treq);
+
+       check_for_zombie(el, tconn, now, u->retry.start);
+}
+
+static void status_check_retry(UNUSED fr_event_list_t *el, fr_time_t now, void *uctx)
+{
+       fr_trunk_request_t      *treq = talloc_get_type_abort(uctx, fr_trunk_request_t);
+       udp_handle_t            *h;
+       udp_request_t           *u = talloc_get_type_abort(treq->preq, udp_request_t);
+       udp_result_t            *r = talloc_get_type_abort(treq->rctx, udp_result_t);
+       request_t               *request = treq->request;
+       fr_trunk_connection_t   *tconn = treq->tconn;
+
+       fr_assert(treq->state == FR_TRUNK_REQUEST_STATE_SENT);          /* No other states should be timing out */
+       fr_assert(treq->preq);                                          /* Must still have a protocol request */
+       fr_assert(u->rr);
+       fr_assert(tconn);
+
+       h = talloc_get_type_abort(treq->tconn->conn->h, udp_handle_t);
+
+       fr_assert(u->status_check);
+
         switch (fr_retry_next(&u->retry, now)) {
         /*
          *      Queue the request for retransmission.
@@ -1739,12 +1782,14 @@ static void request_timeout(fr_event_list_t *el, fr_time_t now, void *uctx)
         r->rcode = RLM_MODULE_FAIL;
         fr_trunk_request_signal_complete(treq);
  
-       if (!u->status_check) return;
+        WARN("%s - No response to status check, marking connection as dead - %s", h->module_name, h->name);
  
-       WARN("%s - No response to status check, marking connection as dead - %s", h->module_name, h->name);
-
-       h->status_checking = false;
-       fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
+       /*
+        *      We're no longer status checking, reconnect the
+        *      connection.
+        */
+        h->status_checking = false;
+        fr_trunk_connection_signal_reconnect(tconn, FR_CONNECTION_FAILED);
  }
  
  static void request_mux(fr_event_list_t *el,
@@ -1757,10 +1802,10 @@ static void request_mux(fr_event_list_t *el,
         size_t                  total_len = 0;
  
         /*
-        *      If the connection just became a zombie
-        *      don't try and enqueue things on it!
+        *      If the connection is zombie, then don't try to enqueue
+        *      things on it!
          */
-       if (check_for_zombie(el, tconn, 0)) return;
+       if (check_for_zombie(el, tconn, 0, h->last_sent)) return;
  
         /*
          *      Encode multiple packets in preparation
@@ -1963,25 +2008,36 @@ static void request_mux(fr_event_list_t *el,
                         action = "Retransmitted";
                 }
  
-               if (!inst->parent->synchronous) {
+               if (u->status_check) {
+                       RDEBUG("%s status check.  Expecting response within %pVs", action,
+                              fr_box_time_delta(u->retry.rt));
+
+                       if (fr_event_timer_at(u, el, &u->ev, u->retry.next, status_check_retry, treq) < 0) {
+                               RERROR("Failed inserting retransmit timeout for connection");
+                               fr_trunk_request_signal_fail(treq);
+                               continue;
+                       }
+
+               } else if (!inst->parent->synchronous) {
                         RDEBUG("%s request.  Expecting response within %pVs", action,
                                fr_box_time_delta(u->retry.rt));
  
-                       if (fr_event_timer_at(u, el, &u->ev, u->retry.next, request_timeout, treq) < 0) {
+                       if (fr_event_timer_at(u, el, &u->ev, u->retry.next, request_retry, treq) < 0) {
                                 RERROR("Failed inserting retransmit timeout for connection");
                                 fr_trunk_request_signal_fail(treq);
                                 continue;
                         }
-               } else {
+
+               } else if (u->retry.count == 1) {
+                       if (fr_event_timer_at(u, el, &u->ev, u->retry.start + h->inst->parent->response_window, request_timeout, treq) < 0) {
+                               RERROR("Failed inserting timeout for connection");
+                               fr_trunk_request_signal_fail(treq);
+                               continue;
+                       }
+
                         /*
                          *      If the packet doesn't get a response,
                          *      then udp_request_free() will notice, and run conn_zombie()
-                        *
-                        *      @todo - set up a response_window which is LESS
-                        *      than max_request_time.  That way we
-                        *      can return "fail", and process the
-                        *      request through a fail handler,
-                        *      instead of just freeing it.
                          */
                         RDEBUG("%s request.  Relying on NAS to perform more retransmissions", action);
                 }
@@ -2641,13 +2697,19 @@ static void mod_signal(module_ctx_t const *mctx, UNUSED request_t *request,
          */
         case FR_SIGNAL_DUP:
                 /*
-                *      Connection just became a zombie
-                *      we don't want to retransmit
-                *      as the connection is now dead
-                *      and we have no idea what state
-                *      the request is in.
+                *      If we're not synchronous, then rely on
+                *      request_retry() to do the retransmissions.
+                */
+               if (!t->inst->parent->synchronous) return;
+
+               /*
+                *      We are synchronous, retransmit the current
+                *      request on the same connection.
+                *
+                *      If it's zombie, we still resend it.  If the
+                *      connection is dead, then a callback will move
+                *      this request to a new connection.
                  */
-               if (check_for_zombie(t->el, r->treq->tconn, 0)) return;
                 fr_trunk_request_requeue(r->treq);
                 return;
author	Alan T. DeKok <aland@freeradius.org>
	Mon, 23 Aug 2021 19:49:59 +0000 (15:49 -0400)
committer	Alan T. DeKok <aland@freeradius.org>
	Mon, 23 Aug 2021 19:51:46 +0000 (15:51 -0400)