]> git.ipfire.org Git - thirdparty/kea.git/commitdiff
[#3250] Delayed transition to terminated
authorMarcin Siodelski <marcin@isc.org>
Wed, 27 Mar 2024 12:22:12 +0000 (13:22 +0100)
committerMarcin Siodelski <marcin@isc.org>
Tue, 16 Apr 2024 11:38:01 +0000 (13:38 +0200)
src/hooks/dhcp/high_availability/communication_state.cc
src/hooks/dhcp/high_availability/communication_state.h
src/hooks/dhcp/high_availability/ha_messages.cc
src/hooks/dhcp/high_availability/ha_messages.h
src/hooks/dhcp/high_availability/ha_messages.mes
src/hooks/dhcp/high_availability/ha_service.cc
src/hooks/dhcp/high_availability/ha_service.h
src/hooks/dhcp/high_availability/tests/communication_state_unittest.cc
src/hooks/dhcp/high_availability/tests/ha_service_unittest.cc
src/hooks/dhcp/high_availability/tests/ha_test.h

index 38b7566df3a2faacbd881aa14bf38db8dc4c7fa4..39cc1ccbc4885e3ae6b6c2889f0d460bb1e3be76 100644 (file)
@@ -58,8 +58,8 @@ CommunicationState::CommunicationState(const IOServicePtr& io_service,
                                        const HAConfigPtr& config)
     : io_service_(io_service), config_(config), timer_(), interval_(0),
       poke_time_(boost::posix_time::microsec_clock::universal_time()),
-      heartbeat_impl_(0), partner_state_(-1), partner_scopes_(),
-      clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(),
+      heartbeat_impl_(0), partner_state_(-1), partner_state_time_(),
+      partner_scopes_(), clock_skew_(0, 0, 0, 0), last_clock_skew_warn_(),
       my_time_at_skew_(), partner_time_at_skew_(),
       analyzed_messages_count_(0), unsent_update_count_(0),
       partner_unsent_update_count_{0, 0}, mutex_(new mutex()) {
@@ -114,13 +114,33 @@ CommunicationState::setPartnerUnavailable() {
 void
 CommunicationState::setPartnerStateInternal(const std::string& state) {
     try {
-        partner_state_ = stringToState(state);
+        auto new_partner_state = stringToState(state);
+        if (new_partner_state != partner_state_) {
+            setCurrentPartnerStateTimeInternal();
+        }
+        partner_state_ = new_partner_state;
     } catch (...) {
         isc_throw(BadValue, "unsupported HA partner state returned "
                   << state);
     }
 }
 
+time_duration
+CommunicationState::getDurationSincePartnerStateTime() const {
+    ptime now = boost::posix_time::microsec_clock::universal_time();
+    if (MultiThreadingMgr::instance().getMode()) {
+        std::lock_guard<std::mutex> lk(*mutex_);
+        return (now - partner_state_time_);
+    } else {
+        return (now - partner_state_time_);
+    }
+}
+
+void
+CommunicationState::setCurrentPartnerStateTimeInternal() {
+    partner_state_time_ = boost::posix_time::microsec_clock::universal_time();
+}
+
 std::set<std::string>
 CommunicationState::getPartnerScopes() const {
     if (MultiThreadingMgr::instance().getMode()) {
index 2ca01db97d0c157d492c9e2bac4310f4a79ce386..9bdaa78656139c00f54b470572121fb9789c2e8d 100644 (file)
@@ -123,6 +123,22 @@ private:
     void setPartnerStateInternal(const std::string& state);
 
 public:
+
+    /// @brief Returns the duration since the partner was first seen in the
+    /// current state.
+    ///
+    /// @return A duration since the partner has been in the current state.
+    boost::posix_time::time_duration getDurationSincePartnerStateTime() const;
+
+private:
+
+    /// @brief Sets partner's state time to current time.
+    ///
+    /// It marks the time when the partner was first seen in the current state.
+    void setCurrentPartnerStateTimeInternal();
+
+public:
+
     /// @brief Returns scopes served by the partner server.
     ///
     /// @return A set of scopes served by the partner.
@@ -706,6 +722,9 @@ protected:
     /// Negative value means that the partner's state is unknown.
     int partner_state_;
 
+    /// Holds a time when partner was first seen in the current state.
+    boost::posix_time::ptime partner_state_time_;
+
     /// @brief Last known set of scopes served by the partner server.
     std::set<std::string> partner_scopes_;
 
index e59517b97e2cade1c6d4635843eb64cc7cbe59bf..b3f6377dd7a0d61f0fcef2627e813f53bf143ce2 100644 (file)
@@ -123,6 +123,7 @@ extern const isc::log::MessageID HA_SYNC_HANDLER_FAILED = "HA_SYNC_HANDLER_FAILE
 extern const isc::log::MessageID HA_SYNC_START = "HA_SYNC_START";
 extern const isc::log::MessageID HA_SYNC_SUCCESSFUL = "HA_SYNC_SUCCESSFUL";
 extern const isc::log::MessageID HA_TERMINATED = "HA_TERMINATED";
+extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART = "HA_TERMINATED_PARTNER_DID_NOT_RESTART";
 extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER = "HA_TERMINATED_RESTART_PARTNER";
 
 } // namespace ha
@@ -247,6 +248,7 @@ const char* values[] = {
     "HA_SYNC_START", "%1: starting lease database synchronization with %2",
     "HA_SYNC_SUCCESSFUL", "%1: lease database synchronization with %2 completed successfully in %3",
     "HA_TERMINATED", "HA %1: service terminated due to an unrecoverable condition. Check previous error message(s), address the problem and restart!",
+    "HA_TERMINATED_PARTNER_DID_NOT_RESTART", "%1: service is terminating because the terminated partner was not restarted within %2 minutes",
     "HA_TERMINATED_RESTART_PARTNER", "%1: waiting for the partner in the TERMINATED state to be restarted",
     NULL
 };
index 8a0c405c20561587e91803f21e93036bed151d90..c2597a8d4f37059ebed696928f51f01450a84026 100644 (file)
@@ -124,6 +124,7 @@ extern const isc::log::MessageID HA_SYNC_HANDLER_FAILED;
 extern const isc::log::MessageID HA_SYNC_START;
 extern const isc::log::MessageID HA_SYNC_SUCCESSFUL;
 extern const isc::log::MessageID HA_TERMINATED;
+extern const isc::log::MessageID HA_TERMINATED_PARTNER_DID_NOT_RESTART;
 extern const isc::log::MessageID HA_TERMINATED_RESTART_PARTNER;
 
 } // namespace ha
index b258188243cdf9beeb132e3171dcea78cef9e067..7611ec87ee94103e93b75ee6252e20f36fa886fe 100644 (file)
@@ -749,3 +749,13 @@ This informational message is issued when the server has been restarted after
 correcting the clock skew. The partner server is still in the terminated
 state. The partner must be restarted before the server can synchronize the
 database and start normal operation.
+
+% HA_TERMINATED_PARTNER_DID_NOT_RESTART %1: service is terminating because the terminated partner was not restarted within %2 minutes
+This warning message is issued to indicate that the HA service is terminating
+because partner server is in the terminated state and was not restarted within
+an expacted time frame. The terminated servers should be restarted after correcting
+the problem that caused the termination. They can be restarted sequentially but the
+duration between the restarts should not be too long. If it is long it may mean that
+the restart of one of the servers was unintentional (e.g., power outage). If the
+restarted server remains in the waiting state it cannot serve DHCP clients. Moving
+to the terminated state at least allows for responding to the DHCP traffic.
index 9b00b3fb9910d38b8e75f1a544275d11d29a6bac..db49b98f4a48e377bec52438e5cb142401e57ade 100644 (file)
@@ -70,6 +70,7 @@ const int HAService::HA_MAINTENANCE_START_EVT;
 const int HAService::HA_MAINTENANCE_CANCEL_EVT;
 const int HAService::HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED;
 const int HAService::HA_SYNCED_PARTNER_UNAVAILABLE_EVT;
+const int HAService::HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES;
 
 HAService::HAService(const unsigned int id, const IOServicePtr& io_service,
                      const NetworkStatePtr& network_state, const HAConfigPtr& config,
@@ -884,7 +885,17 @@ HAService::waitingStateHandler() {
         postNextEvent(NOP_EVT);
         break;
 
-    case HA_TERMINATED_ST:
+    case HA_TERMINATED_ST: {
+        auto partner_in_terminated = communication_state_->getDurationSincePartnerStateTime();
+        if (!partner_in_terminated.is_not_a_date_time() &&
+            partner_in_terminated.minutes() >= HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES) {
+            LOG_WARN(ha_logger, HA_TERMINATED_PARTNER_DID_NOT_RESTART)
+                .arg(config_->getThisServerName())
+                .arg(HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES);
+            verboseTransition(HA_TERMINATED_ST);
+            break;
+        }
+
         // We have checked above whether the clock skew is exceeding the threshold
         // and we should terminate. If we're here, it means that the clock skew
         // is acceptable. The partner may be still in the terminated state because
@@ -895,7 +906,7 @@ HAService::waitingStateHandler() {
             .arg(config_->getThisServerName());
         postNextEvent(NOP_EVT);
         break;
-
+    }
     case HA_WAITING_ST:
         // If both servers are waiting, the primary server 'wins' and is
         // transitioned to the next state first.
index 681b1d23f89e12234fd459ee873931ee5e3de49e..8d677bdc635f8c50d15a72953824293aba1a68db 100644 (file)
@@ -80,6 +80,10 @@ public:
     /// Control result returned in response to ha-maintenance-notify.
     static const int HA_CONTROL_RESULT_MAINTENANCE_NOT_ALLOWED = 1001;
 
+    /// A delay in minutes to transition from the waiting to terminated state
+    /// when the partner remains in terminated state.
+    static const int HA_WAITING_TO_TERMINATED_ST_DELAY_MINUTES = 10;
+
 protected:
 
     /// @brief Callback invoked when request was sent and a response received
index 519d58d43f8f39411aa4080403f1fb0f67e41b96..19ccc58e08a89698b6aad5eae53280e2ef2984ad 100644 (file)
@@ -65,6 +65,14 @@ public:
     /// tested in a different test case.
     void partnerStateUnavailableTest();
 
+    /// @brief Verifies that the duration since the partner time is updated
+    /// correctly when the partner has certain state.
+    void partnerStateTimeExplicitStateTest();
+
+    /// @brief Verifies that the duration since the partner time is updated
+    /// correctly when the partner is unavailable.
+    void partnerStateTimeUnavailableTest();
+
     /// @brief Verifies that the partner's scopes are set and retrieved correctly.
     void partnerScopesTest();
 
@@ -208,6 +216,64 @@ CommunicationStateTest::partnerStateTest() {
     EXPECT_THROW(state_.setPartnerState("unsupported"), BadValue);
 }
 
+// Verifies that the duration since the partner time is updated
+// correctly when the partner has certain state.
+void
+CommunicationStateTest::partnerStateTimeExplicitStateTest() {
+    // Initially the partner state time is not set.
+    time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime();
+    EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time());
+
+    // Set new partner state. The state time should be set close
+    // to the current time.
+    state_.setPartnerState("waiting");
+
+    // Sleep for a short period of time to ensure that the returned
+    // duration is non-zero.
+    usleep(10);
+
+    // The duration must be set.
+    time_duration duration_since_waiting = state_.getDurationSincePartnerStateTime();
+    ASSERT_FALSE(duration_since_waiting.is_not_a_date_time());
+
+    // Check that the partner time is within 10 seconds range from
+    // the current time. 10 seconds difference may sound too high but
+    // this test is not about the precision. It is more important that
+    // it is stable. In the HA state machine the duration of 10 seconds
+    // is relatively low anyway.
+    EXPECT_GT(duration_since_waiting.fractional_seconds(), 0);
+    EXPECT_LT(duration_since_waiting.seconds(), 10);
+}
+
+// Verifies that the duration since the partner time is updated
+// correctly when the partner is unavailable.
+void
+CommunicationStateTest::partnerStateTimeUnavailableTest() {
+    // Initially the partner state time is not set.
+    time_duration duration_since_partner_time = state_.getDurationSincePartnerStateTime();
+    EXPECT_TRUE(duration_since_partner_time.is_not_a_date_time());
+
+    // Set new partner state. The state time should be set close
+    // to the current time.
+    state_.setPartnerUnavailable();
+
+    // Sleep for a short period of time to ensure that the returned
+    // duration is non-zero.
+    usleep(10);
+
+    // The duration must be set.
+    time_duration duration_since_unavail = state_.getDurationSincePartnerStateTime();
+    ASSERT_FALSE(duration_since_unavail.is_not_a_date_time());
+
+    // Check that the partner time is within 10 seconds range from
+    // the current time. 10 seconds difference may sound too high but
+    // this test is not about the precision. It is more important that
+    // it is stable. In the HA state machine the duration of 10 seconds
+    // is relatively low anyway.
+    EXPECT_GT(duration_since_unavail.fractional_seconds(), 0);
+    EXPECT_LT(duration_since_unavail.seconds(), 10);
+}
+
 // Verifies that the partner state is set to unavailable.
 void
 CommunicationStateTest::partnerStateUnavailableTest() {
@@ -987,6 +1053,24 @@ TEST_F(CommunicationStateTest, partnerStateUnavailableTestMultiThreading) {
     partnerStateUnavailableTest();
 }
 
+TEST_F(CommunicationStateTest, partnerStateTimeExplicitStateTest) {
+    partnerStateTimeExplicitStateTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeExplcitStateTestMultiThreading) {
+    MultiThreadingMgr::instance().setMode(true);
+    partnerStateTimeExplicitStateTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTest) {
+    partnerStateTimeUnavailableTest();
+}
+
+TEST_F(CommunicationStateTest, partnerStateTimeUnavailableTestMultiThreading) {
+    MultiThreadingMgr::instance().setMode(true);
+    partnerStateTimeUnavailableTest();
+}
+
 TEST_F(CommunicationStateTest, partnerScopesTest) {
     partnerScopesTest();
 }
index 1198f82e6c8e991a45cf9dcbc89a0b4280921ffe..21c0c5de0d3c56c7b46206fb4848514c89872b04 100644 (file)
@@ -6912,6 +6912,53 @@ TEST_F(HAServiceStateMachineTest, terminateNoTransitionOnRestart) {
         << "' state";
 }
 
+// This test checks that the server transitions back to the terminated state
+// from the waiting state when the partner sits in the terminated state.
+TEST_F(HAServiceStateMachineTest, terminateWhenPartnerRemainsTerminated) {
+    auto config_storage = createValidConfiguration();
+    partner_->startup();
+    startService(config_storage);
+    NakedCommunicationState4Ptr state(new NakedCommunicationState4(io_service_,
+                                                                   config_storage));
+    service_->communication_state_ = state;
+
+    // Set partner's time to the current time. This guarantees that the clock
+    // skew is below 60s and there is no reason for the server to transition
+    // to the terminated state.
+    partner_->setDateTime(HttpDateTime().rfc1123Format());
+    // The partner is in the terminated state to simulate sequential restart
+    // of the two servers from the terminated state.
+    partner_->transition("terminated");
+    // This server is in the waiting state which simulates the restart case.
+    service_->transition(HA_WAITING_ST, HAService::NOP_EVT);
+    // Run the heartbeat.
+    waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT);
+    // The server should remain in the waiting state because the clock skew
+    // is low.
+    EXPECT_EQ(HA_WAITING_ST, service_->getCurrState())
+        << "expected that the server remains in 'waiting' state"
+        << "', but transitioned to the '"
+        << service_->getStateLabel(service_->getCurrState())
+        << "' state";
+
+    // Move the partner state time to long ago. It should cause our server to
+    // assume that the administrator neglected to restart the terminated partner
+    // and transition to the terminated state to continue serving the DHCP
+    // clients.
+    boost::posix_time::ptime long_ago = HttpDateTime().fromRfc1123("Sun, 06 Nov 1994 08:49:37 GMT").getPtime();
+    state->partner_state_time_ = long_ago;
+
+    // Run the heartbeat.
+    waitForEvent(HAService::HA_HEARTBEAT_COMPLETE_EVT);
+
+    // The server should have transitioned to the terminated state.
+    EXPECT_EQ(HA_TERMINATED_ST, service_->getCurrState())
+        << "expected that the server transitions to the 'terminated' state"
+        << "', but transitioned to the '"
+        << service_->getStateLabel(service_->getCurrState())
+        << "' state";
+}
+
 // This test checks all combinations of server and partner states and the
 // resulting state to which the server transitions. This server is secondary.
 // There is another test which validates state transitions from the
index 8f52190fc0532569029b1d4df6147c7f3b8b78f3..911ff9cb876c9e87a6657ae6797a4437232bf87d 100644 (file)
@@ -61,6 +61,7 @@ public:
     using StateType::partner_time_at_skew_;
     using StateType::unsent_update_count_;
     using StateType::getRejectedLeaseUpdatesCountFromContainer;
+    using StateType::partner_state_time_;
 };
 
 /// @brief Type of the NakedCommunicationState for DHCPv4.