]> git.ipfire.org Git - thirdparty/dhcp.git/commitdiff
[master] Patch the failover code to avoid deadlocks
authorShawn Routhier <sar@isc.org>
Tue, 11 Nov 2014 03:04:13 +0000 (19:04 -0800)
committerShawn Routhier <sar@isc.org>
Tue, 11 Nov 2014 03:04:13 +0000 (19:04 -0800)
Patch for 36810 & 20352
This coves several related problems
1) When the primary is in conflict done it allows the secondary to
transition around resolution interrupted and potentical conflict previously
the primary would die on an illegal state.

2) It allows the servers to restart a bind update request.  Previously if
one of the servers sent an udpate request and there died (or had the communications
interrupted) in some states the first server wouldn't retransmit a new
update request and the other server wouldn't send any bind updates. This
was noticed in potential conflict.

3) Updated the state transitions to move the leases on the ack queue
back to the update queue in case of conflict-done as we might need to
retransmit them all.

4) Updated a transition from startup to potentical conflict instead
of resolution interrupted when the servers reconnect during the startup
phase in order to avoid a diffferent dead lock.

RELNOTES
includes/dhcpd.h
server/failover.c

index a4c36129f40da626aed3552069400e94d7c2651d..93acdfd69bcd9735d70ae816a68d99791677b9c7 100644 (file)
--- a/RELNOTES
+++ b/RELNOTES
@@ -137,6 +137,15 @@ by Eric Young (eay@cryptsoft.com).
   the patch upon which the fix is based.
   [ISC-Bugs #32222]
 
+- In the failover code, handle the case of communications being interrupted
+  when the servers are dealing with POTENTIAL-CONFLICT.  This patch allows
+  the primary to accept the secondary moving from POTENTIAL-CONFLICT to
+  RESOLUTION-INTERRUPTED as well as handling the bind update process better.
+  In addition the code to resend update or update all requests has been
+  modified to send requests more often.
+  [ISC-Bugs #36810]
+  [ISC-Bugs #20352]
+
                        Changes since 4.3.1b1
 
 - Modify the linux and openwrt dhclient scripts to process information
index 2043708f53d716b204aad1e26e21315f147a44aa..9ad641be7506249d9d73d0968d5d1baff17d381e 100644 (file)
@@ -3419,6 +3419,7 @@ isc_result_t dhcp_failover_state_signal (omapi_object_t *,
 isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *,
                                             const char *);
 isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state);
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *);
 isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *,
                                      enum failover_state);
 isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *,
index eb2612b7da8f98d4c17918294091495ed4245a51..e6d9fdd6af182cb6964052e0cc20f597eb865ca6 100644 (file)
@@ -1519,8 +1519,16 @@ isc_result_t dhcp_failover_state_transition (dhcp_failover_state_t *state,
                      /* In these situations, we remain in the current
                       * state, or if in startup enter those states.
                       */
-                     case communications_interrupted:
                      case conflict_done:
+                       /* As the peer may not have received or may have
+                        * lost track of updates we sent previously we
+                        * rescind them, causing us to retransmit them
+                        * on an update request.
+                        */
+                       dhcp_failover_rescind_updates(state);
+                       /* fall through */
+
+                     case communications_interrupted:
                      case partner_down:
                      case paused:
                      case recover:
@@ -1703,6 +1711,52 @@ isc_result_t dhcp_failover_set_service_state (dhcp_failover_state_t *state)
        return ISC_R_SUCCESS;
 }
 
+/*!
+ * \brief Return any leases on the ack queue back to the update queue
+ *
+ * Re-schedule any pending updates by moving them from the ack queue
+ * (update sent awaiting response) back to the update queue (need to
+ * send an update for this lease).  This will result in a retransmission
+ * of the update.
+ *
+ * \param state is the state block for the failover connection we are
+ * updating.
+ */
+
+void dhcp_failover_rescind_updates (dhcp_failover_state_t *state)
+{
+    struct lease *lp;
+
+    if (state->ack_queue_tail == NULL)
+           return;
+
+    /* Zap the flags. */
+    for (lp = state->ack_queue_head; lp; lp = lp->next_pending)
+           lp->flags = ((lp->flags & ~ON_ACK_QUEUE) | ON_UPDATE_QUEUE);
+
+    /* Now hook the ack queue to the beginning of the update queue. */
+    if (state->update_queue_head) {
+           lease_reference(&state->ack_queue_tail->next_pending,
+                           state->update_queue_head, MDL);
+           lease_dereference(&state->update_queue_head, MDL);
+    }
+    lease_reference(&state->update_queue_head, state->ack_queue_head, MDL);
+
+    if (!state->update_queue_tail) {
+#if defined (POINTER_DEBUG)
+           if (state->ack_queue_tail->next_pending) {
+                   log_error("next pending on ack queue tail.");
+                   abort();
+           }
+#endif
+           lease_reference(&state->update_queue_tail,
+                           state->ack_queue_tail, MDL);
+    }
+    lease_dereference(&state->ack_queue_tail, MDL);
+    lease_dereference(&state->ack_queue_head, MDL);
+    state->cur_unacked_updates = 0;
+}
+
 isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
                                      enum failover_state new_state)
 {
@@ -1721,37 +1775,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
       case normal:
       case potential_conflict:
       case partner_down:
-       if (state -> ack_queue_tail) {
-           struct lease *lp;
-               
-           /* Zap the flags. */
-           for (lp = state -> ack_queue_head; lp; lp = lp -> next_pending)
-                   lp -> flags = ((lp -> flags & ~ON_ACK_QUEUE) |
-                                  ON_UPDATE_QUEUE);
-
-           /* Now hook the ack queue to the beginning of the update
-              queue. */
-           if (state -> update_queue_head) {
-               lease_reference (&state -> ack_queue_tail -> next_pending,
-                                state -> update_queue_head, MDL);
-               lease_dereference (&state -> update_queue_head, MDL);
-           }
-           lease_reference (&state -> update_queue_head,
-                            state -> ack_queue_head, MDL);
-           if (!state -> update_queue_tail) {
-#if defined (POINTER_DEBUG)
-               if (state -> ack_queue_tail -> next_pending) {
-                   log_error ("next pending on ack queue tail.");
-                   abort ();
-               }
-#endif
-               lease_reference (&state -> update_queue_tail,
-                                state -> ack_queue_tail, MDL);
-           }
-           lease_dereference (&state -> ack_queue_tail, MDL);
-           lease_dereference (&state -> ack_queue_head, MDL);
-           state -> cur_unacked_updates = 0;
-       }
+       /* Move the ack queue to the update queue */
+       dhcp_failover_rescind_updates(state);
+
        /* We will re-queue a timeout later, if applicable. */
        cancel_timeout (dhcp_failover_keepalive, state);
        break;
@@ -1859,7 +1885,9 @@ isc_result_t dhcp_failover_set_state (dhcp_failover_state_t *state,
            break;
 
          case potential_conflict:
-           if (state -> i_am == primary)
+           if ((state->i_am == primary) ||
+               ((state->i_am == secondary) &&
+                (state->partner.state == conflict_done)))
                    dhcp_failover_send_update_request (state);
            break;
 
@@ -1960,7 +1988,18 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
        if (state -> partner.state == new_state && state -> me.state) {
                switch (state -> me.state) {
                      case startup:
-                       dhcp_failover_set_state (state, state -> saved_state);
+                       /*
+                        * If we have a peer state we must be connected.
+                        * If so we should move to potential_conflict
+                        * instead of resolution_interrupted, otherwise
+                        * back to whereever we were before we stopped.
+                        */
+                       if (state->saved_state == resolution_interrupted)
+                               dhcp_failover_set_state(state,
+                                                       potential_conflict);
+                       else 
+                               dhcp_failover_set_state(state,
+                                                       state->saved_state);
                        return ISC_R_SUCCESS;
 
                      case unknown_state:
@@ -2183,6 +2222,17 @@ isc_result_t dhcp_failover_peer_state_changed (dhcp_failover_state_t *state,
                        dhcp_failover_set_state(state, new_state);
                        break;
 
+                     case potential_conflict:
+                     case resolution_interrupted:
+                       /*
+                        * This can happen when the connection is lost and 
+                        * recovered after the primary has moved to 
+                        * conflict-done but the secondary is still in 
+                        * potential-conflict.  In that case, we have to 
+                        * remain in conflict-done.
+                        */
+                       break;
+
                      default:
                        log_fatal("Peer %s: Invalid attempt to move from %s "
                                "to %s while local state is conflict-done.",
@@ -4861,35 +4911,42 @@ isc_result_t dhcp_failover_send_update_request (dhcp_failover_state_t *state)
 # define FMA (char *)0, (unsigned *)0, 0
 #endif
 
-       if (!state -> link_to_peer ||
-           state -> link_to_peer -> type != dhcp_type_failover_link)
-               return DHCP_R_INVALIDARG;
-       link = (dhcp_failover_link_t *)state -> link_to_peer;
+       if (!state->link_to_peer ||
+           state->link_to_peer->type != dhcp_type_failover_link)
+               return (DHCP_R_INVALIDARG);
+       link = (dhcp_failover_link_t *)state->link_to_peer;
 
-       if (!link -> outer || link -> outer -> type != omapi_type_connection)
-               return DHCP_R_INVALIDARG;
+       if (!link->outer || link->outer->type != omapi_type_connection)
+               return (DHCP_R_INVALIDARG);
 
-       if (state -> curUPD)
-               return ISC_R_ALREADYRUNNING;
+       /* We allow an update to be restarted in case we requested an update
+        * and were interrupted by something. If we had an ALL going we need
+        * to restart that.  Otherwise we simply continue with the request */
+       if (state->curUPD == FTM_UPDREQALL) {
+               return (dhcp_failover_send_update_request_all(state));
+       }
 
-       status = (dhcp_failover_put_message
-                 (link, link -> outer,
-                  FTM_UPDREQ, link->xid++,
-                  (failover_option_t *)0));
+       status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQ,
+                                           link->xid++, NULL));
 
-       if (status == ISC_R_SUCCESS)
-               state -> curUPD = FTM_UPDREQ;
+       state->curUPD = FTM_UPDREQ;
 
 #if defined (DEBUG_FAILOVER_MESSAGES)
        if (status != ISC_R_SUCCESS)
-               failover_print (FMA, " (failed)");
-       failover_print (FMA, ")");
+               failover_print(FMA, " (failed)");
+       failover_print(FMA, ")");
        if (obufix) {
-               log_debug ("%s", obuf);
+               log_debug("%s", obuf);
        }
 #endif
-       log_info ("Sent update request message to %s", state -> name);
-       return status;
+
+       if (status == ISC_R_SUCCESS) {
+               log_info("Sent update request message to %s", state->name);
+       } else {
+               log_error("Failed to send update request all message to %s: %s",
+                        state->name, isc_result_totext(status));
+       }
+       return (status);
 }
 
 isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
@@ -4907,36 +4964,39 @@ isc_result_t dhcp_failover_send_update_request_all (dhcp_failover_state_t
 # define FMA (char *)0, (unsigned *)0, 0
 #endif
 
-       if (!state -> link_to_peer ||
-           state -> link_to_peer -> type != dhcp_type_failover_link)
-               return DHCP_R_INVALIDARG;
-       link = (dhcp_failover_link_t *)state -> link_to_peer;
+       if (!state->link_to_peer ||
+           state->link_to_peer->type != dhcp_type_failover_link)
+               return (DHCP_R_INVALIDARG);
+       link = (dhcp_failover_link_t *)state->link_to_peer;
 
-       if (!link -> outer || link -> outer -> type != omapi_type_connection)
-               return DHCP_R_INVALIDARG;
+       if (!link->outer || link->outer->type != omapi_type_connection)
+               return (DHCP_R_INVALIDARG);
 
-       /* If there is an UPDREQ in progress, then upgrade to UPDREQALL. */
-       if (state -> curUPD && (state -> curUPD != FTM_UPDREQ))
-               return ISC_R_ALREADYRUNNING;
+       /* We allow an update to be restarted in case we requested an update
+        * and were interrupted by something.
+        */
 
-       status = (dhcp_failover_put_message
-                 (link, link -> outer,
-                  FTM_UPDREQALL, link->xid++,
-                  (failover_option_t *)0));
+       status = (dhcp_failover_put_message(link, link->outer, FTM_UPDREQALL,
+                                           link->xid++, NULL));
 
-       if (status == ISC_R_SUCCESS)
-               state -> curUPD = FTM_UPDREQALL;
+       state->curUPD = FTM_UPDREQALL;
 
 #if defined (DEBUG_FAILOVER_MESSAGES)
        if (status != ISC_R_SUCCESS)
-               failover_print (FMA, " (failed)");
-       failover_print (FMA, ")");
+               failover_print(FMA, " (failed)");
+       failover_print(FMA, ")");
        if (obufix) {
-               log_debug ("%s", obuf);
+               log_debug("%s", obuf);
        }
 #endif
-       log_info ("Sent update request all message to %s", state -> name);
-       return status;
+
+       if (status == ISC_R_SUCCESS) {
+               log_info("Sent update request all message to %s", state->name);
+       } else {
+               log_error("Failed to send update request all message to %s: %s",
+                        state->name, isc_result_totext(status));
+       }
+       return (status);
 }
 
 isc_result_t dhcp_failover_send_update_done (dhcp_failover_state_t *state)